]> git.sesse.net Git - ffmpeg/blobdiff - postproc/swscale_template.c
using fewer registers (fixes compilation bug hopefully)
[ffmpeg] / postproc / swscale_template.c
index 85e18a8dbd3a0a5930e37d5085a256ffc6f8569b..0bc0457717238f2a06f07568fcc2d8c3f783c164 100644 (file)
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 */
-#define YSCALEYUV2RGBX \
+#define YSCALEYUV2PACKEDX \
                "xorl %%eax, %%eax              \n\t"\
                ".balign 16                     \n\t"\
                "1:                             \n\t"\
                "movl %1, %%edx                 \n\t" /* -chrFilterSize */\
-               "movl %3, %%ebx                 \n\t" /* chrMmxFilter+lumFilterSize */\
-               "movl %7, %%ecx                 \n\t" /* chrSrc+lumFilterSize */\
+               "movl %3, %%ebx                 \n\t" /* chrMmxFilter+chrFilterSize */\
+               "movl %7, %%ecx                 \n\t" /* chrSrc+chrFilterSize */\
                "pxor %%mm3, %%mm3              \n\t"\
                "pxor %%mm4, %%mm4              \n\t"\
                "2:                             \n\t"\
                "paddw %%mm5, %%mm7             \n\t"\
                "addl $1, %%edx                 \n\t"\
                " jnz 2b                        \n\t"\
-\
+
+
+#define YSCALEYUV2RGBX \
+               YSCALEYUV2PACKEDX\
                "psubw "MANGLE(w400)", %%mm3    \n\t" /* (U-128)8*/\
                "psubw "MANGLE(w400)", %%mm4    \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
 \
                "packuswb %%mm1, %%mm1          \n\t"
 
+#define YSCALEYUV2PACKED /* blend two luma lines (%0,%1) and two chroma lines (%2,%3); opens loop label 1: and leaves luma in mm1/mm7, chroma in mm3/mm4 for the writer macro that follows */\
+               "movd %6, %%mm6                 \n\t" /*yalpha1*/\
+               "punpcklwd %%mm6, %%mm6         \n\t"\
+               "punpcklwd %%mm6, %%mm6         \n\t"/* low word of yalpha1 now replicated into all four words */\
+                "psraw $3, %%mm6               \n\t"/* yalpha1>>3 */\
+               "movq %%mm6, 3968(%2)           \n\t"/* park the yalpha1 vector at byte offset 3968 from %2 so no MMX register is tied up; NOTE(review): presumably unused space inside the chroma buffer - confirm */\
+               "movd %7, %%mm5                 \n\t" /*uvalpha1*/\
+               "punpcklwd %%mm5, %%mm5         \n\t"\
+               "punpcklwd %%mm5, %%mm5         \n\t"/* low word of uvalpha1 now replicated into all four words */\
+                "psraw $3, %%mm5               \n\t"/* uvalpha1>>3 */\
+               "movq %%mm5, 3976(%2)           \n\t"/* park the uvalpha1 vector at byte offset 3976 from %2 */\
+               "xorl %%eax, %%eax              \n\t"\
+               ".balign 16                     \n\t"\
+               "1:                             \n\t"\
+               "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
+               "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
+               "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
+               "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
+               "psubw %%mm3, %%mm2             \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
+               "psubw %%mm4, %%mm5             \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
+               "movq 3976(%2), %%mm0           \n\t"/* reload the uvalpha1 vector parked above */\
+               "pmulhw %%mm0, %%mm2            \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
+               "pmulhw %%mm0, %%mm5            \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
+               "psraw $7, %%mm3                \n\t" /* uvbuf1[eax] >>7 (mm3 still holds the second chroma line, not the difference) */\
+               "psraw $7, %%mm4                \n\t" /* uvbuf1[eax+2048] >>7 */\
+               "paddw %%mm2, %%mm3             \n\t" /* (uvbuf1[eax]>>7) + ((uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16) */\
+               "paddw %%mm5, %%mm4             \n\t" /* (uvbuf1[eax+2048]>>7) + ((uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16) */\
+               "movq (%0, %%eax, 2), %%mm0     \n\t" /*buf0[eax]*/\
+               "movq (%1, %%eax, 2), %%mm1     \n\t" /*buf1[eax]*/\
+               "movq 8(%0, %%eax, 2), %%mm6    \n\t" /*buf0[eax+4] (8 bytes = 4 samples further) */\
+               "movq 8(%1, %%eax, 2), %%mm7    \n\t" /*buf1[eax+4]*/\
+               "psubw %%mm1, %%mm0             \n\t" /* buf0[eax] - buf1[eax]*/\
+               "psubw %%mm7, %%mm6             \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
+               "pmulhw 3968(%2), %%mm0         \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
+               "pmulhw 3968(%2), %%mm6         \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
+               "psraw $7, %%mm1                \n\t" /* buf1[eax] >>7 */\
+               "psraw $7, %%mm7                \n\t" /* buf1[eax+4] >>7 */\
+               "paddw %%mm0, %%mm1             \n\t" /* (buf1[eax]>>7) + ((buf0[eax] - buf1[eax])yalpha1>>16) */\
+               "paddw %%mm6, %%mm7             \n\t" /* (buf1[eax+4]>>7) + ((buf0[eax+4] - buf1[eax+4])yalpha1>>16) */\
+                
 #define YSCALEYUV2RGB \
                "movd %6, %%mm6                 \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6         \n\t"\
                "packuswb %%mm6, %%mm5          \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
-
+                
+#define YSCALEYUV2PACKED1 /* single-line variant: reads only %0 (luma) and %2 (chroma), no blending; opens loop label 1: and leaves luma in mm1/mm7, chroma in mm3/mm4 */\
+               "xorl %%eax, %%eax              \n\t"\
+               ".balign 16                     \n\t"\
+               "1:                             \n\t"\
+               "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
+               "movq 4096(%2, %%eax), %%mm4    \n\t" /* uvbuf0[eax+2048]*/\
+               "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] >>7 */\
+               "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] >>7 */\
+               "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
+               "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4] (8 bytes = 4 samples further) */\
+               "psraw $7, %%mm1                \n\t" /* buf0[eax] >>7 */\
+               "psraw $7, %%mm7                \n\t" /* buf0[eax+4] >>7 */\
+                
 #define YSCALEYUV2RGB1 \
                "xorl %%eax, %%eax              \n\t"\
                ".balign 16                     \n\t"\
                "packuswb %%mm3, %%mm4          \n\t"\
                "pxor %%mm7, %%mm7              \n\t"
 
+#define YSCALEYUV2PACKED1b /* like YSCALEYUV2PACKED1 for luma, but chroma is the sum of both chroma lines (%2 and %3) shifted right by 8; leaves luma in mm1/mm7, chroma in mm3/mm4 */\
+               "xorl %%eax, %%eax              \n\t"\
+               ".balign 16                     \n\t"\
+               "1:                             \n\t"\
+               "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
+               "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
+               "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
+               "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
+               "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
+               "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
+               "psrlw $8, %%mm3                \n\t" /* (uvbuf0[eax] + uvbuf1[eax]) >>8, logical shift */\
+               "psrlw $8, %%mm4                \n\t" /* (uvbuf0[eax+2048] + uvbuf1[eax+2048]) >>8, logical shift */\
+               "movq (%0, %%eax, 2), %%mm1     \n\t" /*buf0[eax]*/\
+               "movq 8(%0, %%eax, 2), %%mm7    \n\t" /*buf0[eax+4] (8 bytes = 4 samples further) */\
+               "psraw $7, %%mm1                \n\t" /* buf0[eax] >>7 */\
+               "psraw $7, %%mm7                \n\t" /* buf0[eax+4] >>7 */
+                
 // do vertical chrominance interpolation
 #define YSCALEYUV2RGB1b \
                "xorl %%eax, %%eax              \n\t"\
 #define WRITEBGR24 WRITEBGR24MMX
 #endif
 
+#define WRITEYUY2 /* pack luma words in mm1/mm7 and chroma words in mm3/mm4 to bytes, interleave them as Y U Y V and store 16 bytes at %4 + 2*eax; closes the loop opened at label 1: by the preceding YSCALEYUV2PACKED* macro */\
+                       "packuswb %%mm3, %%mm3          \n\t"/* first chroma plane: words -> unsigned saturated bytes */\
+                       "packuswb %%mm4, %%mm4          \n\t"/* second chroma plane: words -> unsigned saturated bytes */\
+                       "packuswb %%mm7, %%mm1          \n\t"/* mm1 = 8 luma bytes (4 from mm1, then 4 from mm7) */\
+                       "punpcklbw %%mm4, %%mm3         \n\t"/* mm3 = chroma bytes interleaved: mm3[0] mm4[0] mm3[1] mm4[1] ... */\
+                       "movq %%mm1, %%mm7              \n\t"/* keep a copy of the luma bytes for the high half */\
+                       "punpcklbw %%mm3, %%mm1         \n\t"/* low 4 luma bytes interleaved with low 4 chroma bytes */\
+                       "punpckhbw %%mm3, %%mm7         \n\t"/* high 4 luma bytes interleaved with high 4 chroma bytes */\
+\
+                       MOVNTQ(%%mm1, (%4, %%eax, 2))/* MOVNTQ is defined outside this view; presumably a 64-bit store - confirm */\
+                       MOVNTQ(%%mm7, 8(%4, %%eax, 2))\
+\
+                       "addl $8, %%eax                 \n\t"/* 8 luma samples consumed per iteration */\
+                       "cmpl %5, %%eax                 \n\t"/* %5 is dstW in the operand lists that use this macro */\
+                       " jb 1b                         \n\t"/* unsigned compare: loop while eax < %5 */
+
+
 static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
@@ -752,18 +842,14 @@ static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
 /**
  * vertical scale YV12 to a packed output format (BGR32/24/16/15 or YUY2 via MMX, anything else via yuv2packedXinC)
  */
-static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
-                           uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
+                           uint8_t *dest, int dstW, int16_t * lumMmxFilter, int16_t * chrMmxFilter, int dstY)
 {
-/*     if(flags&SWS_FULL_UV_IPOL)
-       {
-//FIXME
-       }//FULL_UV_IPOL
-       else*/
+       switch(c->dstFormat)
        {
 #ifdef HAVE_MMX
-               if(dstFormat == IMGFMT_BGR32) //FIXME untested
+       case IMGFMT_BGR32:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
@@ -776,7 +862,8 @@ static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lu
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
-               else if(dstFormat == IMGFMT_BGR24) //FIXME untested
+               break;
+       case IMGFMT_BGR24:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
@@ -791,7 +878,8 @@ static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lu
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
-               else if(dstFormat==IMGFMT_BGR15)
+               break;
+       case IMGFMT_BGR15:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
@@ -811,7 +899,8 @@ static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lu
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
-               else if(dstFormat==IMGFMT_BGR16)
+               break;
+       case IMGFMT_BGR16:
                {
                        asm volatile(
                                YSCALEYUV2RGBX
@@ -831,31 +920,53 @@ static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lu
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
-#else
-yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
-           chrFilter, chrSrc, chrFilterSize,
-           dest, dstW, dstFormat);
+               break;
+       case IMGFMT_YUY2:
+               {
+                       asm volatile(
+                               YSCALEYUV2PACKEDX
+               /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+
+                               "psraw $3, %%mm3                \n\t"
+                               "psraw $3, %%mm4                \n\t"
+                               "psraw $3, %%mm1                \n\t"
+                               "psraw $3, %%mm7                \n\t"
+                               WRITEYUY2
 
+                       :: "m" (-lumFilterSize), "m" (-chrFilterSize),
+                          "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
+                          "r" (dest), "m" (dstW),
+                          "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
+                       : "%eax", "%ebx", "%ecx", "%edx", "%esi"
+                       );
+               }
+               break;
 #endif
-       } //!FULL_UV_IPOL
+       default:
+               yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
+                           chrFilter, chrSrc, chrFilterSize,
+                           dest, dstW, dstY);
+               break;
+       }
 }
 
-
 /**
  * vertical bilinear scale YV12 to a packed output format (RGB/BGR variants or YUY2)
  */
-static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
-                           uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
+static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
+                           uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
 {
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;
+       int i;
 
+#if 0 //isnt used
        if(flags&SWS_FULL_CHR_H_INT)
        {
-
-#ifdef HAVE_MMX
-               if(dstFormat==IMGFMT_BGR32)
+               switch(dstFormat)
                {
+#ifdef HAVE_MMX
+               case IMGFMT_BGR32:
                        asm volatile(
 
 
@@ -879,9 +990,8 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR24)
-               {
+                       break;
+               case IMGFMT_BGR24:
                        asm volatile(
 
 FULL_YSCALEYUV2RGB
@@ -929,9 +1039,8 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR15)
-               {
+                       break;
+               case IMGFMT_BGR15:
                        asm volatile(
 
 FULL_YSCALEYUV2RGB
@@ -963,9 +1072,8 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR16)
-               {
+                       break;
+               case IMGFMT_BGR16:
                        asm volatile(
 
 FULL_YSCALEYUV2RGB
@@ -997,8 +1105,12 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-#else
+               break;
+#endif
+               case IMGFMT_RGB32:
+#ifndef HAVE_MMX
+               case IMGFMT_BGR32:
+#endif
                if(dstFormat==IMGFMT_BGR32)
                {
                        int i;
@@ -1060,13 +1172,14 @@ FULL_YSCALEYUV2RGB
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                        }
                }
-#endif
        }//FULL_UV_IPOL
        else
        {
+#endif // if 0
 #ifdef HAVE_MMX
-               if(dstFormat==IMGFMT_BGR32)
-               {
+       switch(c->dstFormat)
+       {
+       case IMGFMT_BGR32:
                        asm volatile(
                                YSCALEYUV2RGB
                                WRITEBGR32
@@ -1075,9 +1188,8 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR24)
-               {
+                       return;
+       case IMGFMT_BGR24:
                        asm volatile(
                                "movl %4, %%ebx                 \n\t"
                                YSCALEYUV2RGB
@@ -1087,9 +1199,8 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR15)
-               {
+                       return;
+       case IMGFMT_BGR15:
                        asm volatile(
                                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -1105,9 +1216,8 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR16)
-               {
+                       return;
+       case IMGFMT_BGR16:
                        asm volatile(
                                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -1123,176 +1233,48 @@ FULL_YSCALEYUV2RGB
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-#else
-               if(dstFormat==IMGFMT_BGR32)
-               {
-                       int i;
-#ifdef WORDS_BIGENDIAN
-                       dest++;
-#endif
-                       for(i=0; i<dstW-1; i+=2){
-                               // vertical linear interpolation && yuv2rgb in a single step:
-                               int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                               int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
-                               int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                               int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                               int Cb= yuvtab_40cf[U];
-                               int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                               int Cr= yuvtab_3343[V];
-
-                               dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
-                               dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
-                               dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
-
-                               dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
-                               dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
-                               dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
-                       }
-               }
-               else if(dstFormat==IMGFMT_BGR24)
-               {
-                       int i;
-                       for(i=0; i<dstW-1; i+=2){
-                               // vertical linear interpolation && yuv2rgb in a single step:
-                               int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                               int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
-                               int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                               int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                               int Cb= yuvtab_40cf[U];
-                               int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                               int Cr= yuvtab_3343[V];
-
-                               dest[0]=clip_table[((Y1 + Cb) >>13)];
-                               dest[1]=clip_table[((Y1 + Cg) >>13)];
-                               dest[2]=clip_table[((Y1 + Cr) >>13)];
-
-                               dest[3]=clip_table[((Y2 + Cb) >>13)];
-                               dest[4]=clip_table[((Y2 + Cg) >>13)];
-                               dest[5]=clip_table[((Y2 + Cr) >>13)];
-                               dest+=6;
-                       }
-               }
-               else if(dstFormat==IMGFMT_BGR16)
-               {
-                       int i;
-#ifdef DITHER1XBPP
-                       static int ditherb1=1<<14;
-                       static int ditherg1=1<<13;
-                       static int ditherr1=2<<14;
-                       static int ditherb2=3<<14;
-                       static int ditherg2=3<<13;
-                       static int ditherr2=0<<14;
-
-                       ditherb1 ^= (1^2)<<14;
-                       ditherg1 ^= (1^2)<<13;
-                       ditherr1 ^= (1^2)<<14;
-                       ditherb2 ^= (3^0)<<14;
-                       ditherg2 ^= (3^0)<<13;
-                       ditherr2 ^= (3^0)<<14;
-#else
-                       const int ditherb1=0;
-                       const int ditherg1=0;
-                       const int ditherr1=0;
-                       const int ditherb2=0;
-                       const int ditherg2=0;
-                       const int ditherr2=0;
-#endif
-                       for(i=0; i<dstW-1; i+=2){
-                               // vertical linear interpolation && yuv2rgb in a single step:
-                               int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                               int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
-                               int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                               int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                               int Cb= yuvtab_40cf[U];
-                               int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                               int Cr= yuvtab_3343[V];
-
-                               ((uint16_t*)dest)[i] =
-                                       clip_table16b[(Y1 + Cb + ditherb1) >>13] |
-                                       clip_table16g[(Y1 + Cg + ditherg1) >>13] |
-                                       clip_table16r[(Y1 + Cr + ditherr1) >>13];
-
-                               ((uint16_t*)dest)[i+1] =
-                                       clip_table16b[(Y2 + Cb + ditherb2) >>13] |
-                                       clip_table16g[(Y2 + Cg + ditherg2) >>13] |
-                                       clip_table16r[(Y2 + Cr + ditherr2) >>13];
-                       }
-               }
-               else if(dstFormat==IMGFMT_BGR15)
-               {
-                       int i;
-#ifdef DITHER1XBPP
-                       static int ditherb1=1<<14;
-                       static int ditherg1=1<<14;
-                       static int ditherr1=2<<14;
-                       static int ditherb2=3<<14;
-                       static int ditherg2=3<<14;
-                       static int ditherr2=0<<14;
-
-                       ditherb1 ^= (1^2)<<14;
-                       ditherg1 ^= (1^2)<<14;
-                       ditherr1 ^= (1^2)<<14;
-                       ditherb2 ^= (3^0)<<14;
-                       ditherg2 ^= (3^0)<<14;
-                       ditherr2 ^= (3^0)<<14;
-#else
-                       const int ditherb1=0;
-                       const int ditherg1=0;
-                       const int ditherr1=0;
-                       const int ditherb2=0;
-                       const int ditherg2=0;
-                       const int ditherr2=0;
-#endif
-                       for(i=0; i<dstW-1; i+=2){
-                               // vertical linear interpolation && yuv2rgb in a single step:
-                               int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                               int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
-                               int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                               int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                               int Cb= yuvtab_40cf[U];
-                               int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                               int Cr= yuvtab_3343[V];
+                       return;
+       case IMGFMT_YUY2:
+                       asm volatile(
+                               YSCALEYUV2PACKED
+                               WRITEYUY2
 
-                               ((uint16_t*)dest)[i] =
-                                       clip_table15b[(Y1 + Cb + ditherb1) >>13] |
-                                       clip_table15g[(Y1 + Cg + ditherg1) >>13] |
-                                       clip_table15r[(Y1 + Cr + ditherr1) >>13];
-
-                               ((uint16_t*)dest)[i+1] =
-                                       clip_table15b[(Y2 + Cb + ditherb2) >>13] |
-                                       clip_table15g[(Y2 + Cg + ditherg2) >>13] |
-                                       clip_table15r[(Y2 + Cr + ditherr2) >>13];
-                       }
-               }
-#endif
-       } //!FULL_UV_IPOL
+                       :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
+                       "m" (yalpha1), "m" (uvalpha1)
+                       : "%eax"
+                       );
+                       return;
+       default: break;
+       }
+#endif //HAVE_MMX
+YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
 }
 
 /**
  * YV12 to a packed output format (RGB/BGR variants or YUY2) without scaling or interpolating
  */
-static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
-                           uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
+static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
+                           uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
 {
        int uvalpha1=uvalpha^4095;
        const int yalpha1=0;
+       int i;
+       
+       uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
+       const int yalpha= 4096; //FIXME ...
 
        if(flags&SWS_FULL_CHR_H_INT)
        {
-               RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
+               RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
                return;
        }
 
 #ifdef HAVE_MMX
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
        {
-               if(dstFormat==IMGFMT_BGR32)
+               switch(dstFormat)
                {
+               case IMGFMT_BGR32:
                        asm volatile(
                                YSCALEYUV2RGB1
                                WRITEBGR32
@@ -1300,9 +1282,8 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR24)
-               {
+                       return;
+               case IMGFMT_BGR24:
                        asm volatile(
                                "movl %4, %%ebx                 \n\t"
                                YSCALEYUV2RGB1
@@ -1311,9 +1292,8 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR15)
-               {
+                       return;
+               case IMGFMT_BGR15:
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -1327,9 +1307,8 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR16)
-               {
+                       return;
+               case IMGFMT_BGR16:
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -1344,12 +1323,23 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
+                       return;
+               case IMGFMT_YUY2:
+                       asm volatile(
+                               YSCALEYUV2PACKED1
+                               WRITEYUY2
+                       :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
+                       "m" (yalpha1), "m" (uvalpha1)
+                       : "%eax"
+                       );
+                       return;
                }
        }
        else
        {
-               if(dstFormat==IMGFMT_BGR32)
+               switch(dstFormat)
                {
+               case IMGFMT_BGR32:
                        asm volatile(
                                YSCALEYUV2RGB1b
                                WRITEBGR32
@@ -1357,9 +1347,8 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR24)
-               {
+                       return;
+               case IMGFMT_BGR24:
                        asm volatile(
                                "movl %4, %%ebx                 \n\t"
                                YSCALEYUV2RGB1b
@@ -1368,9 +1357,8 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR15)
-               {
+                       return;
+               case IMGFMT_BGR15:
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -1384,9 +1372,8 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
-               }
-               else if(dstFormat==IMGFMT_BGR16)
-               {
+                       return;
+               case IMGFMT_BGR16:
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
@@ -1401,156 +1388,25 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
+                       return;
+               case IMGFMT_YUY2:
+                       asm volatile(
+                               YSCALEYUV2PACKED1b
+                               WRITEYUY2
+                       :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
+                       "m" (yalpha1), "m" (uvalpha1)
+                       : "%eax"
+                       );
+                       return;
                }
        }
-#else
-//FIXME write 2 versions (for even & odd lines)
-
-       if(dstFormat==IMGFMT_BGR32)
-       {
-               int i;
-#ifdef WORDS_BIGENDIAN
-               dest++;
-#endif
-               for(i=0; i<dstW-1; i+=2){
-                       // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y1=yuvtab_2568[buf0[i]>>7];
-                       int Y2=yuvtab_2568[buf0[i+1]>>7];
-                       int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                       int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                       int Cb= yuvtab_40cf[U];
-                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                       int Cr= yuvtab_3343[V];
-
-                       dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
-                       dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
-                       dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
-
-                       dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
-                       dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
-                       dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
-               }
-       }
-       else if(dstFormat==IMGFMT_BGR24)
-       {
-               int i;
-               for(i=0; i<dstW-1; i+=2){
-                       // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y1=yuvtab_2568[buf0[i]>>7];
-                       int Y2=yuvtab_2568[buf0[i+1]>>7];
-                       int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                       int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                       int Cb= yuvtab_40cf[U];
-                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                       int Cr= yuvtab_3343[V];
-
-                       dest[0]=clip_table[((Y1 + Cb) >>13)];
-                       dest[1]=clip_table[((Y1 + Cg) >>13)];
-                       dest[2]=clip_table[((Y1 + Cr) >>13)];
-
-                       dest[3]=clip_table[((Y2 + Cb) >>13)];
-                       dest[4]=clip_table[((Y2 + Cg) >>13)];
-                       dest[5]=clip_table[((Y2 + Cr) >>13)];
-                       dest+=6;
-               }
-       }
-       else if(dstFormat==IMGFMT_BGR16)
-       {
-               int i;
-#ifdef DITHER1XBPP
-               static int ditherb1=1<<14;
-               static int ditherg1=1<<13;
-               static int ditherr1=2<<14;
-               static int ditherb2=3<<14;
-               static int ditherg2=3<<13;
-               static int ditherr2=0<<14;
-
-               ditherb1 ^= (1^2)<<14;
-               ditherg1 ^= (1^2)<<13;
-               ditherr1 ^= (1^2)<<14;
-               ditherb2 ^= (3^0)<<14;
-               ditherg2 ^= (3^0)<<13;
-               ditherr2 ^= (3^0)<<14;
-#else
-               const int ditherb1=0;
-               const int ditherg1=0;
-               const int ditherr1=0;
-               const int ditherb2=0;
-               const int ditherg2=0;
-               const int ditherr2=0;
 #endif
-               for(i=0; i<dstW-1; i+=2){
-                       // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y1=yuvtab_2568[buf0[i]>>7];
-                       int Y2=yuvtab_2568[buf0[i+1]>>7];
-                       int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                       int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                       int Cb= yuvtab_40cf[U];
-                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                       int Cr= yuvtab_3343[V];
-
-                       ((uint16_t*)dest)[i] =
-                               clip_table16b[(Y1 + Cb + ditherb1) >>13] |
-                               clip_table16g[(Y1 + Cg + ditherg1) >>13] |
-                               clip_table16r[(Y1 + Cr + ditherr1) >>13];
-
-                       ((uint16_t*)dest)[i+1] =
-                               clip_table16b[(Y2 + Cb + ditherb2) >>13] |
-                               clip_table16g[(Y2 + Cg + ditherg2) >>13] |
-                               clip_table16r[(Y2 + Cr + ditherr2) >>13];
-               }
-       }
-       else if(dstFormat==IMGFMT_BGR15)
+       if( uvalpha < 2048 )
        {
-               int i;
-#ifdef DITHER1XBPP
-               static int ditherb1=1<<14;
-               static int ditherg1=1<<14;
-               static int ditherr1=2<<14;
-               static int ditherb2=3<<14;
-               static int ditherg2=3<<14;
-               static int ditherr2=0<<14;
-
-               ditherb1 ^= (1^2)<<14;
-               ditherg1 ^= (1^2)<<14;
-               ditherr1 ^= (1^2)<<14;
-               ditherb2 ^= (3^0)<<14;
-               ditherg2 ^= (3^0)<<14;
-               ditherr2 ^= (3^0)<<14;
-#else
-               const int ditherb1=0;
-               const int ditherg1=0;
-               const int ditherr1=0;
-               const int ditherb2=0;
-               const int ditherg2=0;
-               const int ditherr2=0;
-#endif
-               for(i=0; i<dstW-1; i+=2){
-                       // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y1=yuvtab_2568[buf0[i]>>7];
-                       int Y2=yuvtab_2568[buf0[i+1]>>7];
-                       int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
-                       int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
-
-                       int Cb= yuvtab_40cf[U];
-                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
-                       int Cr= yuvtab_3343[V];
-
-                       ((uint16_t*)dest)[i] =
-                               clip_table15b[(Y1 + Cb + ditherb1) >>13] |
-                               clip_table15g[(Y1 + Cg + ditherg1) >>13] |
-                               clip_table15r[(Y1 + Cr + ditherr1) >>13];
-
-                       ((uint16_t*)dest)[i+1] =
-                               clip_table15b[(Y2 + Cb + ditherb2) >>13] |
-                               clip_table15g[(Y2 + Cg + ditherg2) >>13] |
-                               clip_table15r[(Y2 + Cr + ditherr2) >>13];
-               }
+               YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
+       }else{
+               YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
        }
-#endif
 }
 
 //FIXME yuy2* can read up to 7 samples too many
@@ -2785,7 +2641,8 @@ i--;
            {
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
                {
-                       if((dstY&1) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
+                       const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
+                       if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
@@ -2814,24 +2671,24 @@ i--;
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];
 
-                               RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
-                                                dest, dstW, chrAlpha, dstFormat, flags);
+                               RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
+                                                dest, dstW, chrAlpha, dstFormat, flags, dstY);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];
 
-                               RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
-                                                dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
+                               RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
+                                                dest, dstW, lumAlpha, chrAlpha, dstY);
                        }
                        else //General RGB
                        {
-                               RENAME(yuv2rgbX)(
+                               RENAME(yuv2packedX)(c,
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                       dest, dstW, dstFormat,
-                                       lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
+                                       dest, dstW,
+                                       lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4, dstY);
                        }
                }
             }
@@ -2839,9 +2696,10 @@ i--;
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-               if(isPlanarYUV(dstFormat)) //YV12
+               if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
                {
-                       if(dstY&1) uDest=vDest= NULL;
+                       const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
+                       if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
@@ -2851,10 +2709,10 @@ i--;
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
-                       yuv2rgbXinC(
+                       yuv2packedXinC(c, 
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                               dest, dstW, dstFormat);
+                               dest, dstW, dstY);
                }
            }
        }