unsigned char ch;
for (x = 0; x < nw; ++x) {
#if USE_HORIZONTAL_SSE
+ v4sf acc = { 0.0f, 0.0f, 0.0f, 0.0f }; /* four per-lane running sums, re-zeroed on every iteration of the x loop */
+ static const v4sf low = { 0.0f, 0.0f, 0.0f, 0.0f }; /* lower clamp bound; maxss only compares lane 0 */
+ static const v4sf high = { 255.0f, 255.0f, 255.0f, 255.0f }; /* upper clamp bound; minss only compares lane 0 */
int result;
- float acc;
- long tmp;
- static const float low = 0.0, high = 255.0;
- __asm__ (
- "pxor %1, %1 \n"
- "xor %2, %2 \n"
- "0: \n"
- "movups (%4,%2),%%xmm1 \n"
- "movups (%3,%2),%%xmm2 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "addps %%xmm1,%1 \n"
- "add $16,%2 \n"
- "dec %5 \n"
- "jnz 0b \n"
- "haddps %1,%1 \n"
- "haddps %1,%1 \n"
- "maxss %6,%1 \n"
- "minss %7,%1 \n"
- "cvtss2si %1,%0 \n"
- : "=r" (result),
- "=&x" (acc),
- "=&r" (tmp)
- : "r" (&coeffs[pd[x].startcoeff]),
- "r" (&sptr[pd[x].start]),
- "r" ((pd[x].end - pd[x].start + 1)/4),
- "m" (low),
- "m" (high)
- : "memory", "xmm1", "xmm2"
- );
+ int i; /* counts groups of four floats, not individual filter taps */
+
+ const float *sptr_xmm = &sptr[pd[x].start]; /* first source sample contributing to output x */
+ const float *coeffptr = &coeffs[pd[x].startcoeff]; /* first filter coefficient for output x */
+ const int filter_len = (pd[x].end - pd[x].start + 1) / 4; /* number of 4-float groups; integer division drops any remainder -- presumably the span is padded to a multiple of 4, TODO confirm */
+
+ for (i = 0; i < filter_len; ++i) {
+ v4sf pixels = __builtin_ia32_loadups(&sptr_xmm[i * 4]); /* unaligned load of four source samples */
+ v4sf coeffs = __builtin_ia32_loadups(&coeffptr[i * 4]); /* unaligned load of four coefficients; this local shadows the outer coeffs array inside the loop */
+ acc = __builtin_ia32_addps(acc, __builtin_ia32_mulps(pixels, coeffs)); /* accumulate four independent partial sums, one per lane */
+ }
+ acc = __builtin_ia32_haddps(acc, acc); /* first horizontal add: pairwise sums of adjacent lanes */
+ acc = __builtin_ia32_haddps(acc, acc); /* second horizontal add: every lane now holds the total of all four partial sums */
+ acc = __builtin_ia32_maxss(acc, low); /* clamp lane 0 from below to 0.0f */
+ acc = __builtin_ia32_minss(acc, high); /* clamp lane 0 from above to 255.0f */
+ result = __builtin_ia32_cvtss2si(acc); /* convert lane 0 to int, rounding per the current SSE (MXCSR) rounding mode */
*dptr++ = (unsigned char)result;
#else