]> git.sesse.net Git - ffmpeg/blobdiff - postproc/rgb2rgb_template.c
AltiVec hScale, all size patch by (Romain Dolbeau <dolbeaur at club-internet dot...
[ffmpeg] / postproc / rgb2rgb_template.c
index e299b0c12ecff20e0491208eade1d6b6bbeccfde..eda2ccc83a7a312e47a81e5805d6630e2c3ecd20 100644 (file)
@@ -241,17 +241,6 @@ static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned sr
     }
 }
 
-static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
-{
-       unsigned j,i,num_pixels=src_size/3;
-       for(i=0,j=0; j<num_pixels; i+=3,j+=3)
-       {
-               dst[j+0] = src[i+2];
-               dst[j+1] = src[i+1];
-               dst[j+2] = src[i+0];
-       }
-}
-
 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
 {
   register const uint8_t* s=src;
@@ -318,12 +307,46 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
 #ifdef HAVE_MMX
+       mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
+       asm volatile(
+               "movq %3, %%mm5                 \n\t"
+               "movq %4, %%mm6                 \n\t"
+               "movq %5, %%mm7                 \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 32(%1)                \n\t"
+               "movd   (%1), %%mm0             \n\t"
+               "movd   4(%1), %%mm3            \n\t"
+               "punpckldq 8(%1), %%mm0         \n\t"
+               "punpckldq 12(%1), %%mm3        \n\t"
+               "movq %%mm0, %%mm1              \n\t"
+               "movq %%mm3, %%mm4              \n\t"
+               "pand %%mm6, %%mm0              \n\t"
+               "pand %%mm6, %%mm3              \n\t"
+               "pmaddwd %%mm7, %%mm0           \n\t"
+               "pmaddwd %%mm7, %%mm3           \n\t"
+               "pand %%mm5, %%mm1              \n\t"
+               "pand %%mm5, %%mm4              \n\t"
+               "por %%mm1, %%mm0               \n\t"   
+               "por %%mm4, %%mm3               \n\t"
+               "psrld $5, %%mm0                \n\t"
+               "pslld $11, %%mm3               \n\t"
+               "por %%mm3, %%mm0               \n\t"
+               MOVNTQ" %%mm0, (%0)             \n\t"
+               "addl $16, %1                   \n\t"
+               "addl $8, %0                    \n\t"
+               "cmpl %2, %1                    \n\t"
+               " jb 1b                         \n\t"
+               : "+r" (d), "+r"(s)
+               : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+       );
+#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
-       mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
@@ -359,6 +382,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
                d += 4;
                s += 16;
        }
+#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
 #endif
@@ -441,12 +465,46 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
 #ifdef HAVE_MMX
+       mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
+       asm volatile(
+               "movq %3, %%mm5                 \n\t"
+               "movq %4, %%mm6                 \n\t"
+               "movq %5, %%mm7                 \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 32(%1)                \n\t"
+               "movd   (%1), %%mm0             \n\t"
+               "movd   4(%1), %%mm3            \n\t"
+               "punpckldq 8(%1), %%mm0         \n\t"
+               "punpckldq 12(%1), %%mm3        \n\t"
+               "movq %%mm0, %%mm1              \n\t"
+               "movq %%mm3, %%mm4              \n\t"
+               "pand %%mm6, %%mm0              \n\t"
+               "pand %%mm6, %%mm3              \n\t"
+               "pmaddwd %%mm7, %%mm0           \n\t"
+               "pmaddwd %%mm7, %%mm3           \n\t"
+               "pand %%mm5, %%mm1              \n\t"
+               "pand %%mm5, %%mm4              \n\t"
+               "por %%mm1, %%mm0               \n\t"   
+               "por %%mm4, %%mm3               \n\t"
+               "psrld $6, %%mm0                \n\t"
+               "pslld $10, %%mm3               \n\t"
+               "por %%mm3, %%mm0               \n\t"
+               MOVNTQ" %%mm0, (%0)             \n\t"
+               "addl $16, %1                   \n\t"
+               "addl $8, %0                    \n\t"
+               "cmpl %2, %1                    \n\t"
+               " jb 1b                         \n\t"
+               : "+r" (d), "+r"(s)
+               : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+       );
+#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
-       mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
@@ -482,6 +540,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
                d += 4;
                s += 16;
        }
+#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
 #endif
@@ -1281,9 +1340,15 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsign
        unsigned num_pixels = src_size >> 2;
        for(i=0; i<num_pixels; i++)
        {
-               dst[4*i + 0] = src[4*i + 2];
-               dst[4*i + 1] = src[4*i + 1];
-               dst[4*i + 2] = src[4*i + 0];
+#ifdef WORDS_BIGENDIAN  
+         dst[4*i + 1] = src[4*i + 3];
+         dst[4*i + 2] = src[4*i + 2];
+         dst[4*i + 3] = src[4*i + 1];
+#else
+         dst[4*i + 0] = src[4*i + 2];
+         dst[4*i + 1] = src[4*i + 1];
+         dst[4*i + 2] = src[4*i + 0];
+#endif
        }
 #endif
 }
@@ -1503,6 +1568,108 @@ static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
        RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
 }
 
+static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+       unsigned int width, unsigned int height,
+       int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
+{ // Interleaves planar Y, U and V rows into dst as packed U Y V Y groups (4 bytes per 2 pixels), one dst row per luma row.
+       unsigned y; // index of the luma/output row being written
+       const unsigned chromWidth= width>>1; // one U and one V sample for every two luma samples
+       for(y=0; y<height; y++)
+       {
+#ifdef HAVE_MMX
+//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
+               asm volatile(
+                       "xorl %%eax, %%eax              \n\t" // eax = chroma sample index within the row, starts at 0
+                       ".balign 16                     \n\t"
+                       "1:                             \n\t"
+                       PREFETCH" 32(%1, %%eax, 2)      \n\t"
+                       PREFETCH" 32(%2, %%eax)         \n\t"
+                       PREFETCH" 32(%3, %%eax)         \n\t"
+                       "movq (%2, %%eax), %%mm0        \n\t" // U(0)
+                       "movq %%mm0, %%mm2              \n\t" // U(0)
+                       "movq (%3, %%eax), %%mm1        \n\t" // V(0)
+                       "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
+                       "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
+
+                       "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
+                       "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
+                       "movq %%mm0, %%mm4              \n\t" // UVUV UVUV(0) (copy kept for the high-half unpack)
+                       "movq %%mm2, %%mm6              \n\t" // UVUV UVUV(8) (copy kept for the high-half unpack)
+                       "punpcklbw %%mm3, %%mm0         \n\t" // UYVY UYVY(0)
+                       "punpckhbw %%mm3, %%mm4         \n\t" // UYVY UYVY(4)
+                       "punpcklbw %%mm5, %%mm2         \n\t" // UYVY UYVY(8)
+                       "punpckhbw %%mm5, %%mm6         \n\t" // UYVY UYVY(12)
+
+                       MOVNTQ" %%mm0, (%0, %%eax, 4)   \n\t"
+                       MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
+                       MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
+                       MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
+
+                       "addl $8, %%eax                 \n\t" // 8 chroma samples (16 pixels, 32 output bytes) per iteration
+                       "cmpl %4, %%eax                 \n\t"
+                       " jb 1b                         \n\t" // test is after the body: runs at least once, in steps of 8 chroma samples
+                       ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
+                       : "%eax" // NOTE(review): the asm stores through %0 but lists no "memory" clobber - confirm this is intended
+               );
+#else
+//FIXME adapt the alpha asm code from yv12->yuy2
+
+#if __WORDSIZE >= 64
+               int i;
+               uint64_t *ldst = (uint64_t *) dst;
+               const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+               for(i = 0; i < chromWidth; i += 2){ // two chroma pairs (4 pixels, 8 output bytes) per iteration
+                       uint64_t k, l;
+                       k = uc[0] + (yc[0] << 8) + // NOTE(review): operands promote to int, so yc[1] << 24 overflows for yc[1] >= 128;
+                           (vc[0] << 16) + (yc[1] << 24); // a negative sum would sign-extend into k's upper 32 bits - verify, consider an unsigned cast
+                       l = uc[1] + (yc[2] << 8) +
+                           (vc[1] << 16) + (yc[3] << 24);
+                       *ldst++ = k + (l << 32); // U in the lowest byte: lands in memory as U Y V Y U Y V Y on a little-endian target
+                       yc += 4;
+                       uc += 2;
+                       vc += 2;
+               }
+
+#else
+               int i, *idst = (int32_t *) dst;
+               const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
+               for(i = 0; i < chromWidth; i++){ // one chroma pair (2 pixels, 4 output bytes) per iteration
+                       *idst++ = uc[0] + (yc[0] << 8) +
+                           (vc[0] << 16) + (yc[1] << 24);
+                       yc += 2;
+                       uc++;
+                       vc++;
+               }
+#endif
+#endif
+               if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) ) // mask test: true on every vertLumPerChroma-th row when that value is a power of two
+               {
+                       usrc += chromStride;
+                       vsrc += chromStride;
+               }
+               ysrc += lumStride;
+               dst += dstStride;
+       }
+#ifdef HAVE_MMX
+asm(    EMMS" \n\t" // EMMS/SFENCE are macros defined elsewhere; presumably they reset MMX state and fence the MOVNTQ stores
+        SFENCE" \n\t"
+        :::"memory");
+#endif
+}
+
+/**
+ * Packs planar Y/U/V into UYVY by calling yuvPlanartouyvy with vertLumPerChroma = 2 (each chroma row is used for two luma rows).
+ * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
+ * problem for anyone then tell me, and I'll fix it)
+ */
+static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+       unsigned int width, unsigned int height,
+       int lumStride, int chromStride, int dstStride)
+{
+       //FIXME interpolate chroma
+       RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
+}
+
 /**
  *
  * width should be a multiple of 16