diff --git a/postproc/rgb2rgb_template.c b/postproc/rgb2rgb_template.c
index eb8137045a2b7b244fa40cef7282f800f8474c63..eda2ccc83a7a312e47a81e5805d6630e2c3ecd20 100644
--- a/postproc/rgb2rgb_template.c
+++ b/postproc/rgb2rgb_template.c
@@ -241,17 +241,6 @@ static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned sr
     }
 }
 
-static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
-{
-       unsigned j,i,num_pixels=src_size/3;
-       for(i=0,j=0; j<num_pixels; i+=3,j+=3)
-       {
-               dst[j+0] = src[i+2];
-               dst[j+1] = src[i+1];
-               dst[j+2] = src[i+0];
-       }
-}
-
 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
 {
   register const uint8_t* s=src;
@@ -318,12 +307,46 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
 #ifdef HAVE_MMX
+       mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it's slightly faster)
+       asm volatile(
+               "movq %3, %%mm5                 \n\t"
+               "movq %4, %%mm6                 \n\t"
+               "movq %5, %%mm7                 \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 32(%1)                \n\t"
+               "movd   (%1), %%mm0             \n\t"
+               "movd   4(%1), %%mm3            \n\t"
+               "punpckldq 8(%1), %%mm0         \n\t"
+               "punpckldq 12(%1), %%mm3        \n\t"
+               "movq %%mm0, %%mm1              \n\t"
+               "movq %%mm3, %%mm4              \n\t"
+               "pand %%mm6, %%mm0              \n\t"
+               "pand %%mm6, %%mm3              \n\t"
+               "pmaddwd %%mm7, %%mm0           \n\t"
+               "pmaddwd %%mm7, %%mm3           \n\t"
+               "pand %%mm5, %%mm1              \n\t"
+               "pand %%mm5, %%mm4              \n\t"
+               "por %%mm1, %%mm0               \n\t"   
+               "por %%mm4, %%mm3               \n\t"
+               "psrld $5, %%mm0                \n\t"
+               "pslld $11, %%mm3               \n\t"
+               "por %%mm3, %%mm0               \n\t"
+               MOVNTQ" %%mm0, (%0)             \n\t"
+               "addl $16, %1                   \n\t"
+               "addl $8, %0                    \n\t"
+               "cmpl %2, %1                    \n\t"
+               " jb 1b                         \n\t"
+               : "+r" (d), "+r"(s)
+               : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+       );
+#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
-       mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
@@ -359,25 +382,15 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
                d += 4;
                s += 16;
        }
+#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
 #endif
        while(s < end)
        {
-#ifndef WORDS_BIGENDIAN
-               const int b= *s++;
-               const int g= *s++;
-               const int r= *s++;
-#else
-               const int a= *s++; /*skip*/
-               const int r= *s++;
-               const int g= *s++;
-               const int b= *s++;
-#endif         
-               *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
-#ifndef WORDS_BIGENDIAN
-               s++;
-#endif
+               const int src= *((uint32_t*)s)++;
+               *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
+//             *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
        }
 }
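
For reference, the reworked scalar tail above loads a whole 32-bit BGRA pixel and packs it with masks and shifts instead of reading the channels byte by byte. A minimal stand-alone sketch of that packing, assuming a little-endian load so that B sits in bits 0..7, G in 8..15 and R in 16..23 (the function name is illustrative, not part of the file):

#include <stdint.h>
#include <stdio.h>

/* Pack one 32-bit BGRA pixel (as loaded on a little-endian CPU) into RGB565,
 * mirroring the scalar tail of rgb32to16 above: 5 bits blue, 6 green, 5 red. */
static uint16_t pack_rgb32_to_565(uint32_t px)
{
    return (uint16_t)(((px & 0x0000FF) >> 3)     /* B7..B3 -> bits  0..4  */
                    + ((px & 0x00FC00) >> 5)     /* G7..G2 -> bits  5..10 */
                    + ((px & 0xF80000) >> 8));   /* R7..R3 -> bits 11..15 */
}

int main(void)
{
    printf("%04x\n", pack_rgb32_to_565(0x00FF8040)); /* R=FF G=80 B=40 -> fc08 */
    return 0;
}

The rgb32tobgr16 and rgb32to15/rgb32tobgr15 tails further down are the same idea with red and blue swapped, or with a 5-bit green field.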
 
@@ -437,11 +450,8 @@ static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsign
 #endif
        while(s < end)
        {
-               const int r= *s++;
-               const int g= *s++;
-               const int b= *s++;
-               *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
-               s++;
+               const int src= *((uint32_t*)s)++;
+               *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
        }
 }
 
@@ -455,12 +465,46 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
 #ifdef HAVE_MMX
+       mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it's slightly faster)
+       asm volatile(
+               "movq %3, %%mm5                 \n\t"
+               "movq %4, %%mm6                 \n\t"
+               "movq %5, %%mm7                 \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 32(%1)                \n\t"
+               "movd   (%1), %%mm0             \n\t"
+               "movd   4(%1), %%mm3            \n\t"
+               "punpckldq 8(%1), %%mm0         \n\t"
+               "punpckldq 12(%1), %%mm3        \n\t"
+               "movq %%mm0, %%mm1              \n\t"
+               "movq %%mm3, %%mm4              \n\t"
+               "pand %%mm6, %%mm0              \n\t"
+               "pand %%mm6, %%mm3              \n\t"
+               "pmaddwd %%mm7, %%mm0           \n\t"
+               "pmaddwd %%mm7, %%mm3           \n\t"
+               "pand %%mm5, %%mm1              \n\t"
+               "pand %%mm5, %%mm4              \n\t"
+               "por %%mm1, %%mm0               \n\t"   
+               "por %%mm4, %%mm3               \n\t"
+               "psrld $6, %%mm0                \n\t"
+               "pslld $10, %%mm3               \n\t"
+               "por %%mm3, %%mm0               \n\t"
+               MOVNTQ" %%mm0, (%0)             \n\t"
+               "addl $16, %1                   \n\t"
+               "addl $8, %0                    \n\t"
+               "cmpl %2, %1                    \n\t"
+               " jb 1b                         \n\t"
+               : "+r" (d), "+r"(s)
+               : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+       );
+#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
-       mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
@@ -496,16 +540,14 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
                d += 4;
                s += 16;
        }
+#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
 #endif
        while(s < end)
        {
-               const int b= *s++;
-               const int g= *s++;
-               const int r= *s++;
-               *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
-               s++;
+               const int src= *((uint32_t*)s)++;
+               *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
        }
 }
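
The new pmaddwd variants in rgb32to16 and rgb32to15 above fold the two different shifts needed for blue and red into a single multiply-accumulate: after masking, each dword holds B in its low 16-bit word and R in its high word, so one pmaddwd against per-word constants moves both channels into place, and green is OR-ed in from a separately masked copy before the final shift. A scalar sketch of the 565 case for a single pixel; the mask and multiplier values are worked out here from the shifts the asm performs, not copied from the mask3216*/mul3216 definitions elsewhere in the file:

#include <stdint.h>

/* Emulate the pmaddwd-based rgb32to16 path for one little-endian BGRA pixel.
 * (The MMX code does the same thing for four pixels per iteration.) */
uint16_t pack565_pmaddwd_style(uint32_t px)
{
    uint32_t br = px & 0x00F800F8;   /* top 5 bits of B (low word) and R (high word) */
    uint32_t g  = px & 0x0000FC00;   /* top 6 bits of G, already at bits 10..15      */

    /* pmaddwd: low word * 4 plus high word * 0x2000, i.e. B << 2 and R << 13,
     * so B ends up at bits 5..9 and R at bits 16..20 of the accumulator.      */
    uint32_t acc = (br & 0xFFFF) * 4 + (br >> 16) * 0x2000;

    /* OR in green, then shift by 5 (psrld $5): B -> 0..4, G -> 5..10, R -> 11..15. */
    return (uint16_t)((acc | g) >> 5);
}

The rgb32to15 variant is the same scheme with a 5-bit green field and psrld $6 / pslld $10 as the final shifts; shifting the even pixels down and the odd pixels up before the por is what lets a single movntq write four packed pixels at once.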
 
@@ -565,11 +607,8 @@ static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsign
 #endif
        while(s < end)
        {
-               const int r= *s++;
-               const int g= *s++;
-               const int b= *s++;
-               *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
-               s++;
+               const int src= *((uint32_t*)s)++;
+               *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
        }
 }
 
@@ -1187,12 +1226,18 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned
 #endif
        while(s < end)
        {
+#if 0 //slightly slower on Athlon
+               int bgr= *s++;
+               *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
+#else
+//FIXME: this is very likely wrong for big-endian (and the following converters too)
                register uint16_t bgr;
                bgr = *s++;
                *d++ = (bgr&0x1F)<<3;
                *d++ = (bgr&0x3E0)>>2;
                *d++ = (bgr&0x7C00)>>7;
                *d++ = 0;
+#endif
        }
 }
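
Both branches of the new scalar tail in rgb15to32 produce the same bytes on little-endian: the disabled #if 0 path builds the whole 32-bit pixel in a register and stores it once, while the enabled path writes the four bytes individually. A minimal sketch of the word-building variant, assuming BGR555 input with blue in the low 5 bits (the helper name is illustrative):

#include <stdint.h>

/* Expand one BGR555 value to a 32-bit pixel word the way the disabled branch
 * above does.  Each 5-bit channel moves to the top of its byte; the low 3 bits
 * of every channel (and the alpha byte) stay zero. */
uint32_t expand_555_to_32(uint16_t bgr)
{
    return ((uint32_t)(bgr & 0x001F) << 3)    /* blue  -> bits  0..7  */
         | ((uint32_t)(bgr & 0x03E0) << 6)    /* green -> bits  8..15 */
         | ((uint32_t)(bgr & 0x7C00) << 9);   /* red   -> bits 16..23 */
}

On little-endian the stored word yields exactly the byte sequence the enabled branch writes; on big-endian the two diverge, which is the kind of layout issue the FIXME flags.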
 
@@ -1295,9 +1340,15 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsign
        unsigned num_pixels = src_size >> 2;
        for(i=0; i<num_pixels; i++)
        {
-               dst[4*i + 0] = src[4*i + 2];
-               dst[4*i + 1] = src[4*i + 1];
-               dst[4*i + 2] = src[4*i + 0];
+#ifdef WORDS_BIGENDIAN  
+         dst[4*i + 1] = src[4*i + 3];
+         dst[4*i + 2] = src[4*i + 2];
+         dst[4*i + 3] = src[4*i + 1];
+#else
+         dst[4*i + 0] = src[4*i + 2];
+         dst[4*i + 1] = src[4*i + 1];
+         dst[4*i + 2] = src[4*i + 0];
+#endif
        }
 #endif
 }
@@ -1517,6 +1568,108 @@ static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
 }
 
+static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+       unsigned int width, unsigned int height,
+       int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
+{
+       unsigned y;
+       const unsigned chromWidth= width>>1;
+       for(y=0; y<height; y++)
+       {
+#ifdef HAVE_MMX
+//FIXME: handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
+               asm volatile(
+                       "xorl %%eax, %%eax              \n\t"
+                       ".balign 16                     \n\t"
+                       "1:                             \n\t"
+                       PREFETCH" 32(%1, %%eax, 2)      \n\t"
+                       PREFETCH" 32(%2, %%eax)         \n\t"
+                       PREFETCH" 32(%3, %%eax)         \n\t"
+                       "movq (%2, %%eax), %%mm0        \n\t" // U(0)
+                       "movq %%mm0, %%mm2              \n\t" // U(0)
+                       "movq (%3, %%eax), %%mm1        \n\t" // V(0)
+                       "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
+                       "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
+
+                       "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
+                       "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
+                       "movq %%mm0, %%mm4              \n\t" // Y(0)
+                       "movq %%mm2, %%mm6              \n\t" // Y(8)
+                       "punpcklbw %%mm3, %%mm0         \n\t" // YUYV YUYV(0)
+                       "punpckhbw %%mm3, %%mm4         \n\t" // YUYV YUYV(4)
+                       "punpcklbw %%mm5, %%mm2         \n\t" // YUYV YUYV(8)
+                       "punpckhbw %%mm5, %%mm6         \n\t" // YUYV YUYV(12)
+
+                       MOVNTQ" %%mm0, (%0, %%eax, 4)   \n\t"
+                       MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
+                       MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
+                       MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
+
+                       "addl $8, %%eax                 \n\t"
+                       "cmpl %4, %%eax                 \n\t"
+                       " jb 1b                         \n\t"
+                       ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
+                       : "%eax"
+               );
+#else
+//FIXME: adapt the Alpha asm code from yv12->yuy2
+
+#if __WORDSIZE >= 64
+               int i;
+               uint64_t *ldst = (uint64_t *) dst;
+               const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+               for(i = 0; i < chromWidth; i += 2){
+                       uint64_t k, l;
+                       k = uc[0] + (yc[0] << 8) +
+                           (vc[0] << 16) + (yc[1] << 24);
+                       l = uc[1] + (yc[2] << 8) +
+                           (vc[1] << 16) + (yc[3] << 24);
+                       *ldst++ = k + (l << 32);
+                       yc += 4;
+                       uc += 2;
+                       vc += 2;
+               }
+
+#else
+               int i, *idst = (int32_t *) dst;
+               const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+               for(i = 0; i < chromWidth; i++){
+                       *idst++ = uc[0] + (yc[0] << 8) +
+                           (vc[0] << 16) + (yc[1] << 24);
+                       yc += 2;
+                       uc++;
+                       vc++;
+               }
+#endif
+#endif
+               if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
+               {
+                       usrc += chromStride;
+                       vsrc += chromStride;
+               }
+               ysrc += lumStride;
+               dst += dstStride;
+       }
+#ifdef HAVE_MMX
+asm(    EMMS" \n\t"
+        SFENCE" \n\t"
+        :::"memory");
+#endif
+}
+
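
The C fallback in yuvPlanartouyvy builds each group of two pixels as the 32-bit value U | Y0<<8 | V<<16 | Y1<<24 and stores it in one write (two such words per 64-bit store on __WORDSIZE >= 64 targets). A stand-alone sketch of that packing, assuming a little-endian store so the bytes land as U Y0 V Y1, i.e. UYVY (the helper name is illustrative):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Pack two luma samples and one chroma pair the same way the C fallback does:
 * U | Y0<<8 | V<<16 | Y1<<24, stored as one 32-bit word.  On a little-endian
 * CPU the bytes land in memory as U Y0 V Y1. */
static void pack_uyvy_pair(uint8_t *dst, uint8_t y0, uint8_t y1, uint8_t u, uint8_t v)
{
    uint32_t w = (uint32_t)u | ((uint32_t)y0 << 8) | ((uint32_t)v << 16) | ((uint32_t)y1 << 24);
    memcpy(dst, &w, 4);   /* same layout the int store produces on little-endian */
}

int main(void)
{
    uint8_t out[4];
    pack_uyvy_pair(out, 0x11, 0x22, 0x80, 0x90);
    printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]); /* 80 11 90 22 */
    return 0;
}

For YV12 input the same chroma row is reused for two successive luma rows; that is what the (y & (vertLumPerChroma-1)) test controls.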
+/**
+ *
+ * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
+ * problem for anyone then tell me, and I'll fix it)
+ */
+static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+       unsigned int width, unsigned int height,
+       int lumStride, int chromStride, int dstStride)
+{
+       //FIXME interpolate chroma
+       RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
+}
+
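
A hypothetical caller, assuming the un-RENAMEd yv12touyvy() wrapper is exposed through rgb2rgb.h like the existing yv12toyuy2(); the stride values below simply describe a tightly packed frame and respect the width/height constraints stated above:

#include <stdint.h>
#include "rgb2rgb.h"   /* assumed to declare yv12touyvy() */

/* Convert one tightly packed YV12 frame (width a multiple of 16, height a
 * multiple of 2) to UYVY.  Adjust the strides for padded buffers. */
static void yv12_frame_to_uyvy(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                               uint8_t *uyvy, unsigned width, unsigned height)
{
    yv12touyvy(y, u, v, uyvy, width, height,
               width,        /* lumStride:   1 byte per luma sample        */
               width / 2,    /* chromStride: chroma planes are half width  */
               width * 2);   /* dstStride:   UYVY is 2 bytes per pixel     */
}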
 /**
  *
  * width should be a multiple of 16