- if(((unsigned long)from) & 15)
- /* if SRC is misaligned */
- for(; i>0; i--)
- {
- __asm__ __volatile__ (
- PREFETCH" 320(%0)\n"
- "movups (%0), %%xmm0\n"
- "movups 16(%0), %%xmm1\n"
- "movups 32(%0), %%xmm2\n"
- "movups 48(%0), %%xmm3\n"
- "movntps %%xmm0, (%1)\n"
- "movntps %%xmm1, 16(%1)\n"
- "movntps %%xmm2, 32(%1)\n"
- "movntps %%xmm3, 48(%1)\n"
- :: "r" (from), "r" (to) : "memory");
- ((const unsigned char *)from)+=64;
- ((unsigned char *)to)+=64;
- }
- else
- /*
- Only if SRC is aligned on 16-byte boundary.
- It allows to use movaps instead of movups, which required data
- to be aligned or a general-protection exception (#GP) is generated.
- */
- for(; i>0; i--)
- {
- __asm__ __volatile__ (
- PREFETCH" 320(%0)\n"
- "movaps (%0), %%xmm0\n"
- "movaps 16(%0), %%xmm1\n"
- "movaps 32(%0), %%xmm2\n"
- "movaps 48(%0), %%xmm3\n"
- "movntps %%xmm0, (%1)\n"
- "movntps %%xmm1, 16(%1)\n"
- "movntps %%xmm2, 32(%1)\n"
- "movntps %%xmm3, 48(%1)\n"
- :: "r" (from), "r" (to) : "memory");
- ((const unsigned char *)from)+=64;
- ((unsigned char *)to)+=64;
- }
+ if(((unsigned long)from) & 15)
+ /* if SRC is misaligned */
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ PREFETCH" 320(%0)\n"
+ "movups (%0), %%xmm0\n"
+ "movups 16(%0), %%xmm1\n"
+ "movups 32(%0), %%xmm2\n"
+ "movups 48(%0), %%xmm3\n"
+ "movntps %%xmm0, (%1)\n"
+ "movntps %%xmm1, 16(%1)\n"
+ "movntps %%xmm2, 32(%1)\n"
+ "movntps %%xmm3, 48(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from = (const void *) (((const unsigned char *)from)+64);
+ to = (void *) (((unsigned char *)to)+64);
+ }
+ else
+ /*
+ Only if SRC is aligned on a 16-byte boundary can movaps be used
+ instead of movups: movaps requires its data to be aligned,
+ otherwise a general-protection exception (#GP) is generated.
+ */
+ for(; i>0; i--)
+ {
+ __asm__ __volatile__ (
+ PREFETCH" 320(%0)\n"
+ "movaps (%0), %%xmm0\n"
+ "movaps 16(%0), %%xmm1\n"
+ "movaps 32(%0), %%xmm2\n"
+ "movaps 48(%0), %%xmm3\n"
+ "movntps %%xmm0, (%1)\n"
+ "movntps %%xmm1, 16(%1)\n"
+ "movntps %%xmm2, 32(%1)\n"
+ "movntps %%xmm3, 48(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from = (const void *) (((const unsigned char *)from)+64);
+ to = (void *) (((unsigned char *)to)+64);
+ }
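
This first hunk dispatches on source alignment: movups tolerates any address, while movaps takes the fast aligned path; either way the data leaves through movntps, a non-temporal store that bypasses the cache and itself requires a 16-byte-aligned destination. For reference, the same dispatch can be written with SSE intrinsics. A minimal sketch, assuming the destination is already 16-byte aligned and the length is a multiple of 64; copy64_nt is an illustrative name, not part of the patch:

    #include <stddef.h>
    #include <xmmintrin.h>

    static void copy64_nt(void *to, const void *from, size_t len)
    {
        const float *s = (const float *)from;
        float *d = (float *)to;              /* assumed 16-byte aligned */
        size_t i = len / 64;

        if ((unsigned long)from & 15) {      /* SRC misaligned: movups */
            for (; i > 0; i--, s += 16, d += 16) {
                _mm_prefetch((const char *)s + 320, _MM_HINT_NTA);
                __m128 x0 = _mm_loadu_ps(s);      /* unaligned loads */
                __m128 x1 = _mm_loadu_ps(s + 4);
                __m128 x2 = _mm_loadu_ps(s + 8);
                __m128 x3 = _mm_loadu_ps(s + 12);
                _mm_stream_ps(d, x0);             /* non-temporal stores */
                _mm_stream_ps(d + 4, x1);
                _mm_stream_ps(d + 8, x2);
                _mm_stream_ps(d + 12, x3);
            }
        } else {                             /* SRC aligned: movaps */
            for (; i > 0; i--, s += 16, d += 16) {
                _mm_prefetch((const char *)s + 320, _MM_HINT_NTA);
                __m128 x0 = _mm_load_ps(s);       /* aligned loads */
                __m128 x1 = _mm_load_ps(s + 4);
                __m128 x2 = _mm_load_ps(s + 8);
                __m128 x3 = _mm_load_ps(s + 12);
                _mm_stream_ps(d, x0);
                _mm_stream_ps(d + 4, x1);
                _mm_stream_ps(d + 8, x2);
                _mm_stream_ps(d + 12, x3);
            }
        }
        _mm_sfence();  /* NT stores are weakly ordered; fence before returning */
    }

The trailing _mm_sfence matters because movntps stores are weakly ordered with respect to ordinary stores.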
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "movq 16(%0), %%mm2\n"
- "movq 24(%0), %%mm3\n"
- "movq 32(%0), %%mm4\n"
- "movq 40(%0), %%mm5\n"
- "movq 48(%0), %%mm6\n"
- "movq 56(%0), %%mm7\n"
- MOVNTQ" %%mm0, (%1)\n"
- MOVNTQ" %%mm1, 8(%1)\n"
- MOVNTQ" %%mm2, 16(%1)\n"
- MOVNTQ" %%mm3, 24(%1)\n"
- MOVNTQ" %%mm4, 32(%1)\n"
- MOVNTQ" %%mm5, 40(%1)\n"
- MOVNTQ" %%mm6, 48(%1)\n"
- MOVNTQ" %%mm7, 56(%1)\n"
- :: "r" (from), "r" (to) : "memory");
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ MOVNTQ" %%mm0, (%1)\n"
+ MOVNTQ" %%mm1, 8(%1)\n"
+ MOVNTQ" %%mm2, 16(%1)\n"
+ MOVNTQ" %%mm3, 24(%1)\n"
+ MOVNTQ" %%mm4, 32(%1)\n"
+ MOVNTQ" %%mm5, 40(%1)\n"
+ MOVNTQ" %%mm6, 48(%1)\n"
+ MOVNTQ" %%mm7, 56(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
- if(i>=BLOCK_SIZE/64)
- asm volatile(
- "xorl %%eax, %%eax \n\t"
- ".balign 16 \n\t"
- "1: \n\t"
- "movl (%0, %%eax), %%ebx \n\t"
- "movl 32(%0, %%eax), %%ebx \n\t"
- "movl 64(%0, %%eax), %%ebx \n\t"
- "movl 96(%0, %%eax), %%ebx \n\t"
- "addl $128, %%eax \n\t"
- "cmpl %3, %%eax \n\t"
- " jb 1b \n\t"
-
- "xorl %%eax, %%eax \n\t"
-
- ".balign 16 \n\t"
- "2: \n\t"
- "movq (%0, %%eax), %%mm0\n"
- "movq 8(%0, %%eax), %%mm1\n"
- "movq 16(%0, %%eax), %%mm2\n"
- "movq 24(%0, %%eax), %%mm3\n"
- "movq 32(%0, %%eax), %%mm4\n"
- "movq 40(%0, %%eax), %%mm5\n"
- "movq 48(%0, %%eax), %%mm6\n"
- "movq 56(%0, %%eax), %%mm7\n"
- MOVNTQ" %%mm0, (%1, %%eax)\n"
- MOVNTQ" %%mm1, 8(%1, %%eax)\n"
- MOVNTQ" %%mm2, 16(%1, %%eax)\n"
- MOVNTQ" %%mm3, 24(%1, %%eax)\n"
- MOVNTQ" %%mm4, 32(%1, %%eax)\n"
- MOVNTQ" %%mm5, 40(%1, %%eax)\n"
- MOVNTQ" %%mm6, 48(%1, %%eax)\n"
- MOVNTQ" %%mm7, 56(%1, %%eax)\n"
- "addl $64, %%eax \n\t"
- "cmpl %3, %%eax \n\t"
- "jb 2b \n\t"
+ if(i>=BLOCK_SIZE/64)
+ asm volatile(
+ "xorl %%eax, %%eax \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ "movl (%0, %%eax), %%ebx \n\t"
+ "movl 32(%0, %%eax), %%ebx \n\t"
+ "movl 64(%0, %%eax), %%ebx \n\t"
+ "movl 96(%0, %%eax), %%ebx \n\t"
+ "addl $128, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ " jb 1b \n\t"
+
+ "xorl %%eax, %%eax \n\t"
+
+ ".balign 16 \n\t"
+ "2: \n\t"
+ "movq (%0, %%eax), %%mm0\n"
+ "movq 8(%0, %%eax), %%mm1\n"
+ "movq 16(%0, %%eax), %%mm2\n"
+ "movq 24(%0, %%eax), %%mm3\n"
+ "movq 32(%0, %%eax), %%mm4\n"
+ "movq 40(%0, %%eax), %%mm5\n"
+ "movq 48(%0, %%eax), %%mm6\n"
+ "movq 56(%0, %%eax), %%mm7\n"
+ MOVNTQ" %%mm0, (%1, %%eax)\n"
+ MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+ MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+ MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+ MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+ MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+ MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+ MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+ "addl $64, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ "jb 2b \n\t"
- /* a few percent speedup on out of order executing CPUs */
- "movl %5, %%eax \n\t"
- "2: \n\t"
- "movl (%0), %%ebx \n\t"
- "movl (%0), %%ebx \n\t"
- "movl (%0), %%ebx \n\t"
- "movl (%0), %%ebx \n\t"
- "decl %%eax \n\t"
- " jnz 2b \n\t"
+ /* a few percent speedup on CPUs with out-of-order execution */
+ "movl %5, %%eax \n\t"
+ "2: \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "movl (%0), %%ebx \n\t"
+ "decl %%eax \n\t"
+ " jnz 2b \n\t"
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "movq 16(%0), %%mm2\n"
- "movq 24(%0), %%mm3\n"
- "movq 32(%0), %%mm4\n"
- "movq 40(%0), %%mm5\n"
- "movq 48(%0), %%mm6\n"
- "movq 56(%0), %%mm7\n"
- MOVNTQ" %%mm0, (%1)\n"
- MOVNTQ" %%mm1, 8(%1)\n"
- MOVNTQ" %%mm2, 16(%1)\n"
- MOVNTQ" %%mm3, 24(%1)\n"
- MOVNTQ" %%mm4, 32(%1)\n"
- MOVNTQ" %%mm5, 40(%1)\n"
- MOVNTQ" %%mm6, 48(%1)\n"
- MOVNTQ" %%mm7, 56(%1)\n"
- :: "r" (from), "r" (to) : "memory");
- from = (const void *) (((const unsigned char *)from)+64);
- to = (void *) (((unsigned char *)to)+64);
- }
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ MOVNTQ" %%mm0, (%1)\n"
+ MOVNTQ" %%mm1, 8(%1)\n"
+ MOVNTQ" %%mm2, 16(%1)\n"
+ MOVNTQ" %%mm3, 24(%1)\n"
+ MOVNTQ" %%mm4, 32(%1)\n"
+ MOVNTQ" %%mm5, 40(%1)\n"
+ MOVNTQ" %%mm6, 48(%1)\n"
+ MOVNTQ" %%mm7, 56(%1)\n"
+ :: "r" (from), "r" (to) : "memory");
+ from = (const void *) (((const unsigned char *)from)+64);
+ to = (void *) (((unsigned char *)to)+64);
+ }
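
Note the pointer advancement in this last hunk: plain assignments through casts, rather than the cast-as-lvalue increments being removed, which rely on a GCC extension that modern compilers reject. After the final 64-byte chunk, copies of this shape usually finish with a store fence, an EMMS, and a byte-wise tail copy; a sketch of such an epilogue, assuming names and placement not shown in the patch:

    #include <string.h>

    static void copy_epilogue(void *to, const void *from, size_t tail)
    {
        __asm__ __volatile__ ("sfence" ::: "memory"); /* drain NT stores (SSE-class CPUs) */
        __asm__ __volatile__ ("emms");                /* leave MMX state before FPU use */
        if (tail)                                     /* tail = len % 64 */
            memcpy(to, from, tail);
    }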