X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=modules%2Fmisc%2Fmemcpy%2Ffastmemcpy.h;h=f4546e5752678153d5cda5ac9c0ddeb164e09b6e;hb=6ee1e193fd896ab9a4729fde14f009d9ce629815;hp=4465a9510cfd1b08f771dc486322df75d60f5c4e;hpb=3305b049e7f587b23359a1c9047fb5763d19c1dc;p=vlc

diff --git a/modules/misc/memcpy/fastmemcpy.h b/modules/misc/memcpy/fastmemcpy.h
index 4465a9510c..f4546e5752 100644
--- a/modules/misc/memcpy/fastmemcpy.h
+++ b/modules/misc/memcpy/fastmemcpy.h
@@ -11,7 +11,7 @@
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@@ -116,13 +116,13 @@ If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
 {\
 register unsigned long int dummy;\
 __asm__ __volatile__(\
-    "rep; movsb"\
-    :"=&D"(to), "=&S"(from), "=&c"(dummy)\
+    "rep; movsb"\
+    :"=&D"(to), "=&S"(from), "=&c"(dummy)\
 /* It's most portable way to notify compiler */\
 /* that edi, esi and ecx are clobbered in asm block. */\
 /* Thanks to A'rpi for hint!!! */\
 :"0" (to), "1" (from),"2" (n)\
-    : "memory");\
+    : "memory");\
 }
 
 #ifdef HAVE_SSE
@@ -155,45 +155,45 @@ __asm__ __volatile__(\
 void * fast_memcpy(void * to, const void * from, size_t len)
 {
-    void *retval;
-    size_t i;
-    retval = to;
+    void *retval;
+    size_t i;
+    retval = to;
 #ifdef STATISTICS
-    {
-    static int freq[33];
-    static int t=0;
-    int i;
-    for(i=0; len>(1<<i); i++);
-    freq[i]++;
-    t++;
-    if(1024*1024*1024 % t == 0)
-    for(i=0; i<32; i++)
-    printf("freq < %8d %4d\n", 1<<i, freq[i]);
-    }
+    {
+    static int freq[33];
+    static int t=0;
+    int i;
+    for(i=0; len>(1<<i); i++);
+    freq[i]++;
+    t++;
+    if(1024*1024*1024 % t == 0)
+    for(i=0; i<32; i++)
+    printf("freq < %8d %4d\n", 1<<i, freq[i]);
+    }
 #endif
 #ifndef HAVE_MMX1
 /* PREFETCH has effect even for MOVSB instruction ;) */
-    __asm__ __volatile__ (
-    PREFETCH" (%0)\n"
-    PREFETCH" 64(%0)\n"
-    PREFETCH" 128(%0)\n"
-    PREFETCH" 192(%0)\n"
-    PREFETCH" 256(%0)\n"
-    : : "r" (from) );
+    __asm__ __volatile__ (
+    PREFETCH" (%0)\n"
+    PREFETCH" 64(%0)\n"
+    PREFETCH" 128(%0)\n"
+    PREFETCH" 192(%0)\n"
+    PREFETCH" 256(%0)\n"
+    : : "r" (from) );
 #endif
-    if(len >= MIN_LEN)
-    {
-    register unsigned long int delta;
+    if(len >= MIN_LEN)
+    {
+    register unsigned long int delta;
 /* Align destinition to MMREG_SIZE -boundary */
 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
 if(delta)
-    {
-    delta=MMREG_SIZE-delta;
-    len -= delta;
-    small_memcpy(to, from, delta);
-    }
-    i = len >> 6; /* len/64 */
-    len&=63;
+    {
+    delta=MMREG_SIZE-delta;
+    len -= delta;
+    small_memcpy(to, from, delta);
+    }
+    i = len >> 6; /* len/64 */
+    len&=63;
 /*
    This algorithm is top effective when the code consequently
    reads and writes blocks which have size of cache line.
    Size of cache line must be power of 2.
    Current cache line size is 32 bytes.
    Decreasing number of registers usage leads to less compiler's
    optimization (or some other reasons), therefore we try to use all
@@ -204,182 +204,182 @@ void * fast_memcpy(void * to, const void * from, size_t len)
    processor's decoders, but it's not always possible.
 */
 #ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
-    if(((unsigned long)from) & 15)
-    /* if SRC is misaligned */
-    for(; i>0; i--)
-    {
-    __asm__ __volatile__ (
-    PREFETCH" 320(%0)\n"
-    "movups (%0), %%xmm0\n"
-    "movups 16(%0), %%xmm1\n"
-    "movups 32(%0), %%xmm2\n"
-    "movups 48(%0), %%xmm3\n"
-    "movntps %%xmm0, (%1)\n"
-    "movntps %%xmm1, 16(%1)\n"
-    "movntps %%xmm2, 32(%1)\n"
-    "movntps %%xmm3, 48(%1)\n"
-    :: "r" (from), "r" (to) : "memory");
-    ((const unsigned char *)from)+=64;
-    ((unsigned char *)to)+=64;
-    }
-    else
-    /*
-    Only if SRC is aligned on 16-byte boundary.
-    It allows to use movaps instead of movups, which required data
-    to be aligned or a general-protection exception (#GP) is generated.
-    */
-    for(; i>0; i--)
-    {
-    __asm__ __volatile__ (
-    PREFETCH" 320(%0)\n"
-    "movaps (%0), %%xmm0\n"
-    "movaps 16(%0), %%xmm1\n"
-    "movaps 32(%0), %%xmm2\n"
-    "movaps 48(%0), %%xmm3\n"
-    "movntps %%xmm0, (%1)\n"
-    "movntps %%xmm1, 16(%1)\n"
-    "movntps %%xmm2, 32(%1)\n"
-    "movntps %%xmm3, 48(%1)\n"
-    :: "r" (from), "r" (to) : "memory");
-    ((const unsigned char *)from)+=64;
-    ((unsigned char *)to)+=64;
-    }
+    if(((unsigned long)from) & 15)
+    /* if SRC is misaligned */
+    for(; i>0; i--)
+    {
+    __asm__ __volatile__ (
+    PREFETCH" 320(%0)\n"
+    "movups (%0), %%xmm0\n"
+    "movups 16(%0), %%xmm1\n"
+    "movups 32(%0), %%xmm2\n"
+    "movups 48(%0), %%xmm3\n"
+    "movntps %%xmm0, (%1)\n"
+    "movntps %%xmm1, 16(%1)\n"
+    "movntps %%xmm2, 32(%1)\n"
+    "movntps %%xmm3, 48(%1)\n"
+    :: "r" (from), "r" (to) : "memory");
+    ((const unsigned char *)from)+=64;
+    ((unsigned char *)to)+=64;
+    }
+    else
+    /*
+    Only if SRC is aligned on 16-byte boundary.
+    It allows to use movaps instead of movups, which required data
+    to be aligned or a general-protection exception (#GP) is generated.
+    */
+    for(; i>0; i--)
+    {
+    __asm__ __volatile__ (
+    PREFETCH" 320(%0)\n"
+    "movaps (%0), %%xmm0\n"
+    "movaps 16(%0), %%xmm1\n"
+    "movaps 32(%0), %%xmm2\n"
+    "movaps 48(%0), %%xmm3\n"
+    "movntps %%xmm0, (%1)\n"
+    "movntps %%xmm1, 16(%1)\n"
+    "movntps %%xmm2, 32(%1)\n"
+    "movntps %%xmm3, 48(%1)\n"
+    :: "r" (from), "r" (to) : "memory");
+    ((const unsigned char *)from)+=64;
+    ((unsigned char *)to)+=64;
+    }
 #else
-    /* Align destination at BLOCK_SIZE boundary */
-    for(; ((uintptr_t)to & (BLOCK_SIZE-1)) && i>0; i--)
-    {
-    __asm__ __volatile__ (
+    /* Align destination at BLOCK_SIZE boundary */
+    for(; ((uintptr_t)to & (BLOCK_SIZE-1)) && i>0; i--)
+    {
+    __asm__ __volatile__ (
 #ifndef HAVE_MMX1
-    PREFETCH" 320(%0)\n"
+    PREFETCH" 320(%0)\n"
 #endif
-    "movq (%0), %%mm0\n"
-    "movq 8(%0), %%mm1\n"
-    "movq 16(%0), %%mm2\n"
-    "movq 24(%0), %%mm3\n"
-    "movq 32(%0), %%mm4\n"
-    "movq 40(%0), %%mm5\n"
-    "movq 48(%0), %%mm6\n"
-    "movq 56(%0), %%mm7\n"
-    MOVNTQ" %%mm0, (%1)\n"
-    MOVNTQ" %%mm1, 8(%1)\n"
-    MOVNTQ" %%mm2, 16(%1)\n"
-    MOVNTQ" %%mm3, 24(%1)\n"
-    MOVNTQ" %%mm4, 32(%1)\n"
-    MOVNTQ" %%mm5, 40(%1)\n"
-    MOVNTQ" %%mm6, 48(%1)\n"
-    MOVNTQ" %%mm7, 56(%1)\n"
-    :: "r" (from), "r" (to) : "memory");
+    "movq (%0), %%mm0\n"
+    "movq 8(%0), %%mm1\n"
+    "movq 16(%0), %%mm2\n"
+    "movq 24(%0), %%mm3\n"
+    "movq 32(%0), %%mm4\n"
+    "movq 40(%0), %%mm5\n"
+    "movq 48(%0), %%mm6\n"
+    "movq 56(%0), %%mm7\n"
+    MOVNTQ" %%mm0, (%1)\n"
+    MOVNTQ" %%mm1, 8(%1)\n"
+    MOVNTQ" %%mm2, 16(%1)\n"
+    MOVNTQ" %%mm3, 24(%1)\n"
+    MOVNTQ" %%mm4, 32(%1)\n"
+    MOVNTQ" %%mm5, 40(%1)\n"
+    MOVNTQ" %%mm6, 48(%1)\n"
+    MOVNTQ" %%mm7, 56(%1)\n"
+    :: "r" (from), "r" (to) : "memory");
 from = (const void *) (((const unsigned char *)from)+64);
-    to = (void *) (((unsigned char *)to)+64);
-    }
+    to = (void *) (((unsigned char *)to)+64);
+    }
 
-/* printf(" %p %p\n", (uintptr_t)from&1023, (uintptr_t)to&1023); */
-    /* Pure Assembly cuz gcc is a bit unpredictable ;) */
+/* printf(" %p %p\n", (uintptr_t)from&1023, (uintptr_t)to&1023); */
+    /* Pure Assembly cuz gcc is a bit unpredictable ;) */
 # if 0
-    if(i>=BLOCK_SIZE/64)
-    asm volatile(
-    "xorl %%eax, %%eax \n\t"
-    ".balign 16 \n\t"
-    "1: \n\t"
-    "movl (%0, %%eax), %%ebx \n\t"
-    "movl 32(%0, %%eax), %%ebx \n\t"
-    "movl 64(%0, %%eax), %%ebx \n\t"
-    "movl 96(%0, %%eax), %%ebx \n\t"
-    "addl $128, %%eax \n\t"
-    "cmpl %3, %%eax \n\t"
-    " jb 1b \n\t"
-
-    "xorl %%eax, %%eax \n\t"
-
-    ".balign 16 \n\t"
-    "2: \n\t"
-    "movq (%0, %%eax), %%mm0\n"
-    "movq 8(%0, %%eax), %%mm1\n"
"movq 16(%0, %%eax), %%mm2\n" - "movq 24(%0, %%eax), %%mm3\n" - "movq 32(%0, %%eax), %%mm4\n" - "movq 40(%0, %%eax), %%mm5\n" - "movq 48(%0, %%eax), %%mm6\n" - "movq 56(%0, %%eax), %%mm7\n" - MOVNTQ" %%mm0, (%1, %%eax)\n" - MOVNTQ" %%mm1, 8(%1, %%eax)\n" - MOVNTQ" %%mm2, 16(%1, %%eax)\n" - MOVNTQ" %%mm3, 24(%1, %%eax)\n" - MOVNTQ" %%mm4, 32(%1, %%eax)\n" - MOVNTQ" %%mm5, 40(%1, %%eax)\n" - MOVNTQ" %%mm6, 48(%1, %%eax)\n" - MOVNTQ" %%mm7, 56(%1, %%eax)\n" - "addl $64, %%eax \n\t" - "cmpl %3, %%eax \n\t" - "jb 2b \n\t" + if(i>=BLOCK_SIZE/64) + asm volatile( + "xorl %%eax, %%eax \n\t" + ".balign 16 \n\t" + "1: \n\t" + "movl (%0, %%eax), %%ebx \n\t" + "movl 32(%0, %%eax), %%ebx \n\t" + "movl 64(%0, %%eax), %%ebx \n\t" + "movl 96(%0, %%eax), %%ebx \n\t" + "addl $128, %%eax \n\t" + "cmpl %3, %%eax \n\t" + " jb 1b \n\t" + + "xorl %%eax, %%eax \n\t" + + ".balign 16 \n\t" + "2: \n\t" + "movq (%0, %%eax), %%mm0\n" + "movq 8(%0, %%eax), %%mm1\n" + "movq 16(%0, %%eax), %%mm2\n" + "movq 24(%0, %%eax), %%mm3\n" + "movq 32(%0, %%eax), %%mm4\n" + "movq 40(%0, %%eax), %%mm5\n" + "movq 48(%0, %%eax), %%mm6\n" + "movq 56(%0, %%eax), %%mm7\n" + MOVNTQ" %%mm0, (%1, %%eax)\n" + MOVNTQ" %%mm1, 8(%1, %%eax)\n" + MOVNTQ" %%mm2, 16(%1, %%eax)\n" + MOVNTQ" %%mm3, 24(%1, %%eax)\n" + MOVNTQ" %%mm4, 32(%1, %%eax)\n" + MOVNTQ" %%mm5, 40(%1, %%eax)\n" + MOVNTQ" %%mm6, 48(%1, %%eax)\n" + MOVNTQ" %%mm7, 56(%1, %%eax)\n" + "addl $64, %%eax \n\t" + "cmpl %3, %%eax \n\t" + "jb 2b \n\t" #if CONFUSION_FACTOR > 0 - /* a few percent speedup on out of order executing CPUs */ - "movl %5, %%eax \n\t" - "2: \n\t" - "movl (%0), %%ebx \n\t" - "movl (%0), %%ebx \n\t" - "movl (%0), %%ebx \n\t" - "movl (%0), %%ebx \n\t" - "decl %%eax \n\t" - " jnz 2b \n\t" + /* a few percent speedup on out of order executing CPUs */ + "movl %5, %%eax \n\t" + "2: \n\t" + "movl (%0), %%ebx \n\t" + "movl (%0), %%ebx \n\t" + "movl (%0), %%ebx \n\t" + "movl (%0), %%ebx \n\t" + "decl %%eax \n\t" + " jnz 2b \n\t" #endif - "xorl %%eax, %%eax \n\t" - "addl %3, %0 \n\t" - "addl %3, %1 \n\t" - "subl %4, %2 \n\t" - "cmpl %4, %2 \n\t" - " jae 1b \n\t" - : "+r" (from), "+r" (to), "+r" (i) - : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR) - : "%eax", "%ebx" - ); + "xorl %%eax, %%eax \n\t" + "addl %3, %0 \n\t" + "addl %3, %1 \n\t" + "subl %4, %2 \n\t" + "cmpl %4, %2 \n\t" + " jae 1b \n\t" + : "+r" (from), "+r" (to), "+r" (i) + : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR) + : "%eax", "%ebx" + ); #endif - for(; i>0; i--) - { - __asm__ __volatile__ ( + for(; i>0; i--) + { + __asm__ __volatile__ ( #ifndef HAVE_MMX1 - PREFETCH" 320(%0)\n" + PREFETCH" 320(%0)\n" #endif - "movq (%0), %%mm0\n" - "movq 8(%0), %%mm1\n" - "movq 16(%0), %%mm2\n" - "movq 24(%0), %%mm3\n" - "movq 32(%0), %%mm4\n" - "movq 40(%0), %%mm5\n" - "movq 48(%0), %%mm6\n" - "movq 56(%0), %%mm7\n" - MOVNTQ" %%mm0, (%1)\n" - MOVNTQ" %%mm1, 8(%1)\n" - MOVNTQ" %%mm2, 16(%1)\n" - MOVNTQ" %%mm3, 24(%1)\n" - MOVNTQ" %%mm4, 32(%1)\n" - MOVNTQ" %%mm5, 40(%1)\n" - MOVNTQ" %%mm6, 48(%1)\n" - MOVNTQ" %%mm7, 56(%1)\n" - :: "r" (from), "r" (to) : "memory"); - from = (const void *) (((const unsigned char *)from)+64); - to = (void *) (((unsigned char *)to)+64); - } + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "movq 16(%0), %%mm2\n" + "movq 24(%0), %%mm3\n" + "movq 32(%0), %%mm4\n" + "movq 40(%0), %%mm5\n" + "movq 48(%0), %%mm6\n" + "movq 56(%0), %%mm7\n" + MOVNTQ" %%mm0, (%1)\n" + MOVNTQ" %%mm1, 8(%1)\n" + MOVNTQ" %%mm2, 16(%1)\n" + MOVNTQ" %%mm3, 24(%1)\n" + MOVNTQ" %%mm4, 32(%1)\n" + 
MOVNTQ" %%mm5, 40(%1)\n" + MOVNTQ" %%mm6, 48(%1)\n" + MOVNTQ" %%mm7, 56(%1)\n" + :: "r" (from), "r" (to) : "memory"); + from = (const void *) (((const unsigned char *)from)+64); + to = (void *) (((unsigned char *)to)+64); + } #endif /* Have SSE */ #ifdef HAVE_MMX2 /* since movntq is weakly-ordered, a "sfence" - * is needed to become ordered again. */ - __asm__ __volatile__ ("sfence":::"memory"); + * is needed to become ordered again. */ + __asm__ __volatile__ ("sfence":::"memory"); #endif #ifndef HAVE_SSE - /* enables to use FPU */ - __asm__ __volatile__ (EMMS:::"memory"); + /* enables to use FPU */ + __asm__ __volatile__ (EMMS:::"memory"); #endif - } - /* - * Now do the tail of the block - */ - if(len) small_memcpy(to, from, len); - return retval; + } + /* + * Now do the tail of the block + */ + if(len) small_memcpy(to, from, len); + return retval; }