- float acc = 0.0;
- float *cf = &coeffs[pd[x].startcoeff];
- unsigned sx;
-
- for (sx = pd[x].start; sx <= pd[x].end; ++sx) {
- acc += sptr[sx] * *cf++;
- }
+ /*
+  * SSE dot product for output sample x: multiplies the floats starting at
+  * &sptr[pd[x].start] by the floats starting at &coeffs[pd[x].startcoeff],
+  * four lanes per iteration, sums the lanes horizontally, and clamps lane 0
+  * of the result to [low, high] before it is stored in acc.
+  *
+  * Unaligned loads (movups) are used, so neither buffer needs 16-byte
+  * alignment. haddps is an SSE3 instruction.
+  *
+  * NOTE(review): the group count is (end - start + 1)/4, so up to three
+  * trailing elements are not summed here, unlike the scalar loop this
+  * replaces. Presumably the runs are padded to a multiple of four -- confirm
+  * against the code that fills pd[] and coeffs[].
+  */
+ float acc;
+ static const float low = 0.0, high = 255.0;
+ const void *cfp = &coeffs[pd[x].startcoeff];   /* advanced by the asm */
+ const void *srcp = &sptr[pd[x].start];         /* advanced by the asm */
+ unsigned long groups = (pd[x].end - pd[x].start + 1)/4; /* counted down by the asm */
+ asm (
+ "pxor %0, %0 \n"            /* zero the four partial sums */
+ "test %3, %3 \n"            /* no complete group of four: skip the loop */
+ "jz 2f \n"
+ "1: \n"                     /* numeric local label: safe if the asm is duplicated */
+ "movups (%2),%%xmm1 \n"     /* four source values */
+ "movups (%1),%%xmm2 \n"     /* four coefficients */
+ "mulps %%xmm2,%%xmm1 \n"
+ "addps %%xmm1,%0 \n"
+ "add $16,%1 \n"             /* step both pointers by four floats */
+ "add $16,%2 \n"
+ "dec %3 \n"
+ "jnz 1b \n"
+ "2: \n"
+ "haddps %0,%0 \n"           /* two horizontal adds leave the total in every lane */
+ "haddps %0,%0 \n"
+ "maxss %4,%0 \n"            /* clamp below at low */
+ "minss %5,%0 \n"            /* clamp above at high */
+ /* acc is written before the other operands are consumed, hence '&'.
+    The pointers and the counter are modified, so they are "+r", not inputs. */
+ : "=&x" (acc), "+r" (cfp), "+r" (srcp), "+r" (groups)
+ : "m" (low),
+ "m" (high)
+ /* "memory": the asm reads the buffers behind cfp/srcp; "cc": flags change. */
+ : "xmm1", "xmm2", "cc", "memory"
+ );