compile with unrolled loops.
asm (
"pxor %1, %1 \n"
"xor %2, %2 \n"
asm (
"pxor %1, %1 \n"
"xor %2, %2 \n"
"movups (%4,%2),%%xmm1 \n"
"movups (%3,%2),%%xmm2 \n"
"mulps %%xmm2,%%xmm1 \n"
"addps %%xmm1,%1 \n"
"add $16,%2 \n"
"dec %5 \n"
"movups (%4,%2),%%xmm1 \n"
"movups (%3,%2),%%xmm2 \n"
"mulps %%xmm2,%%xmm1 \n"
"addps %%xmm1,%1 \n"
"add $16,%2 \n"
"dec %5 \n"
"haddps %1,%1 \n"
"haddps %1,%1 \n"
"maxss %6,%1 \n"
"haddps %1,%1 \n"
"haddps %1,%1 \n"
"maxss %6,%1 \n"
"pxor %%xmm3, %%xmm3 \n"
/* main loop */
"pxor %%xmm3, %%xmm3 \n"
/* main loop */
/* a zero is useful during unpacking */
"pxor %%xmm4, %%xmm4 \n"
/* a zero is useful during unpacking */
"pxor %%xmm4, %%xmm4 \n"
"add $4, %2 \n"
"add %3, %0 \n"
"dec %1 \n"
"add $4, %2 \n"
"add %3, %0 \n"
"dec %1 \n"
/* store the values */
"movaps %%xmm0, (%4) \n"
/* store the values */
"movaps %%xmm0, (%4) \n"