swscale: rearrange code.

[ffmpeg] / libswscale / swscale.c
diff --git a/libswscale/swscale.c b/libswscale/swscale.c

index 9705b14d00e0d22c774b9822ba1e449f67af7abc..68508a974169455b8fe4e0ddeb23ceb343c36189 100644 (file)
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -285,42 +285,13 @@ yuv2NBPS(10, LE, 0);
  yuv2NBPS(16, BE, 1);
  yuv2NBPS(16, LE, 0);
  
-static inline void yuv2yuvX16_c(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
-                                const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize,
-                                const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, int dstW, int chrDstW,
-                                enum PixelFormat dstFormat)
-{
-#define conv16(bits) \
-    if (isBE(dstFormat)) { \
-        yuv2yuvX ## bits ## BE_c(c, lumFilter, lumSrc, lumFilterSize, \
-                                 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                                 alpSrc, \
-                                 dest, uDest, vDest, aDest, \
-                                 dstW, chrDstW); \
-    } else { \
-        yuv2yuvX ## bits ## LE_c(c, lumFilter, lumSrc, lumFilterSize, \
-                                 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
-                                 alpSrc, \
-                                 dest, uDest, vDest, aDest, \
-                                 dstW, chrDstW); \
-    }
-    if (is16BPS(dstFormat)) {
-        conv16(16);
-    } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
-        conv16(9);
-    } else {
-        conv16(10);
-    }
-#undef conv16
-}
-
-static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
-                              const int16_t **lumSrc, int lumFilterSize,
-                              const int16_t *chrFilter, const int16_t **chrUSrc,
-                              const int16_t **chrVSrc,
-                              int chrFilterSize, const int16_t **alpSrc,
-                              uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
-                              uint8_t *aDest, int dstW, int chrDstW)
+static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
+                       const int16_t **lumSrc, int lumFilterSize,
+                       const int16_t *chrFilter, const int16_t **chrUSrc,
+                       const int16_t **chrVSrc,
+                       int chrFilterSize, const int16_t **alpSrc,
+                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
+                       uint8_t *aDest, int dstW, int chrDstW)
  {
      //FIXME Optimize (just quickly written not optimized..)
      int i;
@@ -356,16 +327,45 @@ static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
  
              aDest[i]= av_clip_uint8(val>>19);
          }
+}
+
+static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
+                       const int16_t *chrUSrc, const int16_t *chrVSrc,
+                       const int16_t *alpSrc,
+                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
+                       uint8_t *aDest, int dstW, int chrDstW)
+{
+    int i;
+    for (i=0; i<dstW; i++) {
+        int val= (lumSrc[i]+64)>>7;
+        dest[i]= av_clip_uint8(val);
+    }
+
+    if (uDest)
+        for (i=0; i<chrDstW; i++) {
+            int u=(chrUSrc[i]+64)>>7;
+            int v=(chrVSrc[i]+64)>>7;
+            uDest[i]= av_clip_uint8(u);
+            vDest[i]= av_clip_uint8(v);
+        }
  
+    if (CONFIG_SWSCALE_ALPHA && aDest)
+        for (i=0; i<dstW; i++) {
+            int val= (alpSrc[i]+64)>>7;
+            aDest[i]= av_clip_uint8(val);
+        }
  }
  
-static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
-                               const int16_t **lumSrc, int lumFilterSize,
-                               const int16_t *chrFilter, const int16_t **chrUSrc,
-                               const int16_t **chrVSrc,
-                               int chrFilterSize, uint8_t *dest, uint8_t *uDest,
-                               int dstW, int chrDstW, enum PixelFormat dstFormat)
+static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
+                        const int16_t **lumSrc, int lumFilterSize,
+                        const int16_t *chrFilter, const int16_t **chrUSrc,
+                        const int16_t **chrVSrc, int chrFilterSize,
+                        const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest,
+                        uint8_t *vDest, uint8_t *aDest,
+                        int dstW, int chrDstW)
  {
+    enum PixelFormat dstFormat = c->dstFormat;
+
      //FIXME Optimize (just quickly written not optimized..)
      int i;
      for (i=0; i<dstW; i++) {
@@ -408,7 +408,127 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
          }
  }
  
-#define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha) \
+static av_always_inline void
+yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
+                        const int16_t **lumSrc, int lumFilterSize,
+                        const int16_t *chrFilter, const int16_t **chrUSrc,
+                        const int16_t **chrVSrc, int chrFilterSize,
+                        const int16_t **alpSrc, uint8_t *dest, int dstW,
+                        int y, enum PixelFormat target)
+{
+    int i;
+
+#define output_pixel(pos, val) \
+        if (target == PIX_FMT_GRAY16BE) { \
+            AV_WB16(pos, val); \
+        } else { \
+            AV_WL16(pos, val); \
+        }
+    for (i = 0; i < (dstW >> 1); i++) {
+        int j;
+        int Y1 = 1 << 18;
+        int Y2 = 1 << 18;
+        const int i2 = 2 * i;
+
+        for (j = 0; j < lumFilterSize; j++) {
+            Y1 += lumSrc[j][i2]   * lumFilter[j];
+            Y2 += lumSrc[j][i2+1] * lumFilter[j];
+        }
+        Y1 >>= 11;
+        Y2 >>= 11;
+        if ((Y1 | Y2) & 0x10000) {
+            Y1 = av_clip_uint16(Y1);
+            Y2 = av_clip_uint16(Y2);
+        }
+        output_pixel(&dest[2 * i2 + 0], Y1);
+        output_pixel(&dest[2 * i2 + 2], Y2);
+    }
+}
+
+static av_always_inline void
+yuv2gray16_2_c_template(SwsContext *c, const uint16_t *buf0,
+                        const uint16_t *buf1, const uint16_t *ubuf0,
+                        const uint16_t *ubuf1, const uint16_t *vbuf0,
+                        const uint16_t *vbuf1, const uint16_t *abuf0,
+                        const uint16_t *abuf1, uint8_t *dest, int dstW,
+                        int yalpha, int uvalpha, int y,
+                        enum PixelFormat target)
+{
+    int  yalpha1 = 4095 - yalpha; \
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        const int i2 = 2 * i;
+        int Y1 = (buf0[i2  ] * yalpha1 + buf1[i2  ] * yalpha) >> 11;
+        int Y2 = (buf0[i2+1] * yalpha1 + buf1[i2+1] * yalpha) >> 11;
+
+        output_pixel(&dest[2 * i2 + 0], Y1);
+        output_pixel(&dest[2 * i2 + 2], Y2);
+    }
+}
+
+static av_always_inline void
+yuv2gray16_1_c_template(SwsContext *c, const uint16_t *buf0,
+                        const uint16_t *ubuf0, const uint16_t *ubuf1,
+                        const uint16_t *vbuf0, const uint16_t *vbuf1,
+                        const uint16_t *abuf0, uint8_t *dest, int dstW,
+                        int uvalpha, enum PixelFormat dstFormat,
+                        int flags, int y, enum PixelFormat target)
+{
+    int i;
+
+    for (i = 0; i < (dstW >> 1); i++) {
+        const int i2 = 2 * i;
+        int Y1 = buf0[i2  ] << 1;
+        int Y2 = buf0[i2+1] << 1;
+
+        output_pixel(&dest[2 * i2 + 0], Y1);
+        output_pixel(&dest[2 * i2 + 2], Y2);
+    }
+#undef output_pixel
+}
+
+#define YUV2PACKEDWRAPPER(name, ext, fmt) \
+static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
+                        const int16_t **lumSrc, int lumFilterSize, \
+                        const int16_t *chrFilter, const int16_t **chrUSrc, \
+                        const int16_t **chrVSrc, int chrFilterSize, \
+                        const int16_t **alpSrc, uint8_t *dest, int dstW, \
+                        int y) \
+{ \
+    name ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
+                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+                          alpSrc, dest, dstW, y, fmt); \
+} \
+ \
+static void name ## ext ## _2_c(SwsContext *c, const uint16_t *buf0, \
+                        const uint16_t *buf1, const uint16_t *ubuf0, \
+                        const uint16_t *ubuf1, const uint16_t *vbuf0, \
+                        const uint16_t *vbuf1, const uint16_t *abuf0, \
+                        const uint16_t *abuf1, uint8_t *dest, int dstW, \
+                        int yalpha, int uvalpha, int y) \
+{ \
+    name ## _2_c_template(c, buf0, buf1, ubuf0, ubuf1, \
+                          vbuf0, vbuf1, abuf0, abuf1, \
+                          dest, dstW, yalpha, uvalpha, y, fmt); \
+} \
+ \
+static void name ## ext ## _1_c(SwsContext *c, const uint16_t *buf0, \
+                        const uint16_t *ubuf0, const uint16_t *ubuf1, \
+                        const uint16_t *vbuf0, const uint16_t *vbuf1, \
+                        const uint16_t *abuf0, uint8_t *dest, int dstW, \
+                        int uvalpha, enum PixelFormat dstFormat, \
+                        int flags, int y) \
+{ \
+    name ## _1_c_template(c, buf0, ubuf0, ubuf1, vbuf0, \
+                          vbuf1, abuf0, dest, dstW, uvalpha, \
+                          dstFormat, flags, y, fmt); \
+}
+
+YUV2PACKEDWRAPPER(yuv2gray16, LE, PIX_FMT_GRAY16LE);
+YUV2PACKEDWRAPPER(yuv2gray16, BE, PIX_FMT_GRAY16BE);
+
+#define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
      for (i=0; i<(dstW>>1); i++) {\
          int j;\
          int Y1 = 1<<18;\
@@ -431,6 +551,12 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
          Y2>>=19;\
          U >>=19;\
          V >>=19;\
+        if ((Y1|Y2|U|V)&0x100) {\
+            Y1 = av_clip_uint8(Y1); \
+            Y2 = av_clip_uint8(Y2); \
+            U  = av_clip_uint8(U); \
+            V  = av_clip_uint8(V); \
+        }\
          if (alpha) {\
              A1 = 1<<18;\
              A2 = 1<<18;\
@@ -440,26 +566,13 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
              }\
              A1>>=19;\
              A2>>=19;\
+            if ((A1|A2)&0x100) {\
+                A1 = av_clip_uint8(A1); \
+                A2 = av_clip_uint8(A2); \
+            }\
          }
  
-#define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
-        YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha)\
-        if ((Y1|Y2|U|V)&256) {\
-            if (Y1>255)   Y1=255; \
-            else if (Y1<0)Y1=0;   \
-            if (Y2>255)   Y2=255; \
-            else if (Y2<0)Y2=0;   \
-            if (U>255)    U=255;  \
-            else if (U<0) U=0;    \
-            if (V>255)    V=255;  \
-            else if (V<0) V=0;    \
-        }\
-        if (alpha && ((A1|A2)&256)) {\
-            A1=av_clip_uint8(A1);\
-            A2=av_clip_uint8(A2);\
-        }
-
-#define YSCALE_YUV_2_PACKEDX_FULL_C(rnd,alpha) \
+#define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
      for (i=0; i<dstW; i++) {\
          int j;\
          int Y = 0;\
@@ -483,12 +596,9 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
              for (j=0; j<lumFilterSize; j++)\
                  A += alpSrc[j][i     ] * lumFilter[j];\
              A >>=19;\
-            if (A&256)\
+            if (A&0x100)\
                  A = av_clip_uint8(A);\
-        }
-
-#define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
-    YSCALE_YUV_2_PACKEDX_FULL_C(rnd>>3,alpha)\
+        }\
          Y-= c->yuv2rgb_y_offset;\
          Y*= c->yuv2rgb_y_coeff;\
          Y+= rnd;\
@@ -496,35 +606,9 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
          G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
          B= Y +                          U*c->yuv2rgb_u2b_coeff;\
          if ((R|G|B)&(0xC0000000)) {\
-            if (R>=(256<<22))   R=(256<<22)-1; \
-            else if (R<0)R=0;   \
-            if (G>=(256<<22))   G=(256<<22)-1; \
-            else if (G<0)G=0;   \
-            if (B>=(256<<22))   B=(256<<22)-1; \
-            else if (B<0)B=0;   \
-        }
-
-#define YSCALE_YUV_2_GRAY16_C \
-    for (i=0; i<(dstW>>1); i++) {\
-        int j;\
-        int Y1 = 1<<18;\
-        int Y2 = 1<<18;\
-        int U  = 1<<18;\
-        int V  = 1<<18;\
-        \
-        const int i2= 2*i;\
-        \
-        for (j=0; j<lumFilterSize; j++) {\
-            Y1 += lumSrc[j][i2] * lumFilter[j];\
-            Y2 += lumSrc[j][i2+1] * lumFilter[j];\
-        }\
-        Y1>>=11;\
-        Y2>>=11;\
-        if ((Y1|Y2|U|V)&65536) {\
-            if (Y1>65535)   Y1=65535; \
-            else if (Y1<0)Y1=0;   \
-            if (Y2>65535)   Y2=65535; \
-            else if (Y2<0)Y2=0;   \
+            R = av_clip_uintp2(R, 30); \
+            G = av_clip_uintp2(G, 30); \
+            B = av_clip_uintp2(B, 30); \
          }
  
  #define YSCALE_YUV_2_RGBX_C(type,alpha) \
@@ -547,12 +631,6 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
              A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
          }
  
-#define YSCALE_YUV_2_GRAY16_2_C   \
-    for (i=0; i<(dstW>>1); i++) { \
-        const int i2= 2*i;       \
-        int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
-        int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;
-
  #define YSCALE_YUV_2_RGB2_C(type,alpha) \
      YSCALE_YUV_2_PACKED2_C(type,alpha)\
      r = (type *)c->table_rV[V];\
@@ -573,12 +651,6 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
              A2= abuf0[i2+1]>>7;\
          }
  
-#define YSCALE_YUV_2_GRAY16_1_C \
-    for (i=0; i<(dstW>>1); i++) {\
-        const int i2= 2*i;\
-        int Y1= buf0[i2  ]<<1;\
-        int Y2= buf0[i2+1]<<1;
-
  #define YSCALE_YUV_2_RGB1_C(type,alpha) \
      YSCALE_YUV_2_PACKED1_C(type,alpha)\
      r = (type *)c->table_rV[V];\
@@ -637,11 +709,9 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
          }\
          Y1>>=19;\
          Y2>>=19;\
-        if ((Y1|Y2)&256) {\
-            if (Y1>255)   Y1=255;\
-            else if (Y1<0)Y1=0;\
-            if (Y2>255)   Y2=255;\
-            else if (Y2<0)Y2=0;\
+        if ((Y1|Y2)&0x100) {\
+            Y1 = av_clip_uint8(Y1); \
+            Y2 = av_clip_uint8(Y2); \
          }\
          acc+= acc + g[Y1+d128[(i+0)&7]];\
          acc+= acc + g[Y2+d128[(i+1)&7]];\
@@ -651,7 +721,7 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
          }\
      }
  
-#define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
+#define YSCALE_YUV_2_ANYRGB_C(func, func2, func_monoblack)\
      switch(c->dstFormat) {\
      case PIX_FMT_RGB48BE:\
      case PIX_FMT_RGB48LE:\
@@ -855,22 +925,6 @@ static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
              ((uint8_t*)dest)[2*i2+3]= Y2;\
          }                \
          break;\
-    case PIX_FMT_GRAY16BE:\
-        func_g16\
-            ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
-            ((uint8_t*)dest)[2*i2+1]= Y1;\
-            ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
-            ((uint8_t*)dest)[2*i2+3]= Y2;\
-        }                \
-        break;\
-    case PIX_FMT_GRAY16LE:\
-        func_g16\
-            ((uint8_t*)dest)[2*i2+0]= Y1;\
-            ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
-            ((uint8_t*)dest)[2*i2+2]= Y2;\
-            ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
-        }                \
-        break;\
      }
  
  static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
@@ -880,14 +934,14 @@ static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
                            const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
  {
      int i;
-    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
+    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_MONOX_C)
  }
  
-static inline void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
-                                   const int16_t **lumSrc, int lumFilterSize,
-                                   const int16_t *chrFilter, const int16_t **chrUSrc,
-                                   const int16_t **chrVSrc, int chrFilterSize,
-                                   const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
+static void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
+                            const int16_t **lumSrc, int lumFilterSize,
+                            const int16_t *chrFilter, const int16_t **chrUSrc,
+                            const int16_t **chrVSrc, int chrFilterSize,
+                            const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
  {
      int i;
      int step= c->dstFormatBpp/8;
@@ -969,7 +1023,49 @@ static inline void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
      }
  }
  
-static void fillPlane(uint8_t* plane, int stride, int width, int height, int y, uint8_t val)
+/**
+ * vertical bilinear scale YV12 to RGB
+ */
+static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
+                          const uint16_t *buf1, const uint16_t *ubuf0,
+                          const uint16_t *ubuf1, const uint16_t *vbuf0,
+                          const uint16_t *vbuf1, const uint16_t *abuf0,
+                          const uint16_t *abuf1, uint8_t *dest, int dstW,
+                          int yalpha, int uvalpha, int y)
+{
+    int  yalpha1=4095- yalpha;
+    int uvalpha1=4095-uvalpha;
+    int i;
+
+    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_MONO2_C)
+}
+
+/**
+ * YV12 to RGB without scaling or interpolating
+ */
+static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
+                          const uint16_t *ubuf0, const uint16_t *ubuf1,
+                          const uint16_t *vbuf0, const uint16_t *vbuf1,
+                          const uint16_t *abuf0, uint8_t *dest, int dstW,
+                          int uvalpha, enum PixelFormat dstFormat,
+                          int flags, int y)
+{
+    const int yalpha1=0;
+    int i;
+
+    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+    const int yalpha= 4096; //FIXME ...
+
+    if (uvalpha < 2048) {
+        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_MONO2_C)
+    } else {
+        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_MONO2_C)
+    }
+}
+
+static av_always_inline void fillPlane(uint8_t* plane, int stride,
+                                       int width, int height,
+                                       int y, uint8_t val)
  {
      int i;
      uint8_t *ptr = plane + stride*y;
@@ -979,58 +1075,85 @@ static void fillPlane(uint8_t* plane, int stride, int width, int height, int y,
      }
  }
  
-#define rgb48funcs(LE_BE, rfunc, compA, compB, compC) \
-static void compA ## compB ## compC ## 48 ## LE_BE ## ToY_c( \
-                       uint8_t *dst, const uint8_t *src, int width, \
-                       uint32_t *unused) \
+static av_always_inline void
+rgb48ToY_c_template(uint8_t *dst, const uint8_t *src, int width,
+                    enum PixelFormat origin)
+{
+    int i;
+    for (i = 0; i < width; i++) {
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+        int a = input_pixel(&src[i*6+0]) >> 8;
+        int g = input_pixel(&src[i*6+2]) >> 8;
+        int c = input_pixel(&src[i*6+4]) >> 8;
+
+#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? c : a)
+#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? a : c)
+        dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void
+rgb48ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
+                    const uint8_t *src1, const uint8_t *src2,
+                    int width, enum PixelFormat origin)
+{
+    int i;
+    assert(src1==src2);
+    for (i = 0; i < width; i++) {
+        int a = input_pixel(&src1[6*i + 0]) >> 8;
+        int g = input_pixel(&src1[6*i + 2]) >> 8;
+        int c = input_pixel(&src1[6*i + 4]) >> 8;
+
+        dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static av_always_inline void
+rgb48ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
+                          const uint8_t *src1, const uint8_t *src2,
+                          int width, enum PixelFormat origin)
+{
+    int i;
+    assert(src1==src2);
+    for (i = 0; i < width; i++) {
+        int a = (input_pixel(&src1[12*i + 0]) >> 8) + (input_pixel(&src1[12*i + 6]) >> 8);
+        int g = (input_pixel(&src1[12*i + 2]) >> 8) + (input_pixel(&src1[12*i + 8]) >> 8);
+        int c = (input_pixel(&src1[12*i + 4]) >> 8) + (input_pixel(&src1[12*i + 10]) >> 8);
+
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
+    }
+#undef r
+#undef b
+#undef input_pixel
+}
+
+#define rgb48funcs(pattern, BE_LE, origin) \
+static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *dst, const uint8_t *src, \
+                                    int width, uint32_t *unused) \
  { \
-    int i; \
-    for (i = 0; i < width; i++) { \
-        int compA = rfunc(&src[i*6+0]) >> 8; \
-        int compB = rfunc(&src[i*6+2]) >> 8; \
-        int compC = rfunc(&src[i*6+4]) >> 8; \
- \
-        dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
-    } \
+    rgb48ToY_c_template(dst, src, width, origin); \
  } \
   \
-static void compA ## compB ## compC ## 48 ## LE_BE ## ToUV_c( \
-                        uint8_t *dstU, uint8_t *dstV, \
-                        const uint8_t *src1, const uint8_t *src2, \
-                        int width, uint32_t *unused) \
+static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
+                                    const uint8_t *src1, const uint8_t *src2, \
+                                    int width, uint32_t *unused) \
  { \
-    int i; \
-    assert(src1==src2); \
-    for (i = 0; i < width; i++) { \
-        int compA = rfunc(&src1[6*i + 0]) >> 8; \
-        int compB = rfunc(&src1[6*i + 2]) >> 8; \
-        int compC = rfunc(&src1[6*i + 4]) >> 8; \
- \
-        dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
-        dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
-    } \
+    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
  } \
   \
-static void compA ## compB ## compC ## 48 ## LE_BE ## ToUV_half_c( \
-                            uint8_t *dstU, uint8_t *dstV, \
-                            const uint8_t *src1, const uint8_t *src2, \
-                            int width, uint32_t *unused) \
+static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
+                                    const uint8_t *src1, const uint8_t *src2, \
+                                    int width, uint32_t *unused) \
  { \
-    int i; \
-    assert(src1==src2); \
-    for (i = 0; i < width; i++) { \
-        int compA = (rfunc(&src1[12*i + 0]) >> 8) + (rfunc(&src1[12*i + 6]) >> 8); \
-        int compB = (rfunc(&src1[12*i + 2]) >> 8) + (rfunc(&src1[12*i + 8]) >> 8); \
-        int compC = (rfunc(&src1[12*i + 4]) >> 8) + (rfunc(&src1[12*i + 10]) >> 8); \
- \
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1); \
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1); \
-    } \
+    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
  }
-rgb48funcs(LE, AV_RL16, r, g, b);
-rgb48funcs(BE, AV_RB16, r, g, b);
-rgb48funcs(LE, AV_RL16, b, g, r);
-rgb48funcs(BE, AV_RB16, b, g, r);
+
+rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
+rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
+rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
+rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
  
  #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
  static void name ## _c(uint8_t *dst, const uint8_t *src, \
@@ -1055,22 +1178,6 @@ BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY
  BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
  BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
  
-static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dst[i]= src[4*i];
-    }
-}
-
-static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dst[i]= src[4*i+3];
-    }
-}
-
  #define BGR2UV(type, name, shr, shg, shb, shp, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S) \
  static void name ## _c(uint8_t *dstU, uint8_t *dstV, \
                         const uint8_t *src, const uint8_t *dummy, \
@@ -1115,6 +1222,22 @@ BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<
  BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
  BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
  
+static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i];
+    }
+}
+
+static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
+{
+    int i;
+    for (i=0; i<width; i++) {
+        dst[i]= src[4*i+3];
+    }
+}
+
  static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
  {
      int i;
@@ -1161,73 +1284,6 @@ static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
      }
  }
  
-static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
-                       const int16_t *chrUSrc, const int16_t *chrVSrc,
-                       const int16_t *alpSrc,
-                       uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
-                       uint8_t *aDest, int dstW, int chrDstW)
-{
-    int i;
-    for (i=0; i<dstW; i++) {
-        int val= (lumSrc[i]+64)>>7;
-        dest[i]= av_clip_uint8(val);
-    }
-
-    if (uDest)
-        for (i=0; i<chrDstW; i++) {
-            int u=(chrUSrc[i]+64)>>7;
-            int v=(chrVSrc[i]+64)>>7;
-            uDest[i]= av_clip_uint8(u);
-            vDest[i]= av_clip_uint8(v);
-        }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest)
-        for (i=0; i<dstW; i++) {
-            int val= (alpSrc[i]+64)>>7;
-            aDest[i]= av_clip_uint8(val);
-        }
-}
-
-/**
- * vertical bilinear scale YV12 to RGB
- */
-static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
-                          const uint16_t *buf1, const uint16_t *ubuf0,
-                          const uint16_t *ubuf1, const uint16_t *vbuf0,
-                          const uint16_t *vbuf1, const uint16_t *abuf0,
-                          const uint16_t *abuf1, uint8_t *dest, int dstW,
-                          int yalpha, int uvalpha, int y)
-{
-    int  yalpha1=4095- yalpha;
-    int uvalpha1=4095-uvalpha;
-    int i;
-
-    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
-}
-
-/**
- * YV12 to RGB without scaling or interpolating
- */
-static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
-                          const uint16_t *ubuf0, const uint16_t *ubuf1,
-                          const uint16_t *vbuf0, const uint16_t *vbuf1,
-                          const uint16_t *abuf0, uint8_t *dest, int dstW,
-                          int uvalpha, enum PixelFormat dstFormat,
-                          int flags, int y)
-{
-    const int yalpha1=0;
-    int i;
-
-    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-    const int yalpha= 4096; //FIXME ...
-
-    if (uvalpha < 2048) {
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
-    } else {
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
-    }
-}
-
  //FIXME yuy2* can read up to 7 samples too much
  
  static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
@@ -1315,33 +1371,51 @@ static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
  }
  
  // FIXME Maybe dither instead.
-#define YUV_NBPS(depth, endianness, rfunc) \
-static void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
-                                          const uint8_t *_srcU, const uint8_t *_srcV, \
-                                          int width, uint32_t *unused) \
+static av_always_inline void
+yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
+                          const uint8_t *_srcU, const uint8_t *_srcV,
+                          int width, enum PixelFormat origin, int depth)
+{
+    int i;
+    const uint16_t *srcU = (const uint16_t *) _srcU;
+    const uint16_t *srcV = (const uint16_t *) _srcV;
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+    for (i = 0; i < width; i++) {
+        dstU[i] = input_pixel(&srcU[i]) >> (depth - 8);
+        dstV[i] = input_pixel(&srcV[i]) >> (depth - 8);
+    }
+}
+
+static av_always_inline void
+yuv9_or_10ToY_c_template(uint8_t *dstY, const uint8_t *_srcY,
+                         int width, enum PixelFormat origin, int depth)
+{
+    int i;
+    const uint16_t *srcY = (const uint16_t*)_srcY;
+
+    for (i = 0; i < width; i++)
+        dstY[i] = input_pixel(&srcY[i]) >> (depth - 8);
+#undef input_pixel
+}
+
+#define YUV_NBPS(depth, BE_LE, origin) \
+static void BE_LE ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
+                                     const uint8_t *srcU, const uint8_t *srcV, \
+                                     int width, uint32_t *unused) \
  { \
-    int i; \
-    const uint16_t *srcU = (const uint16_t*)_srcU; \
-    const uint16_t *srcV = (const uint16_t*)_srcV; \
-    for (i = 0; i < width; i++) { \
-        dstU[i] = rfunc(&srcU[i])>>(depth-8); \
-        dstV[i] = rfunc(&srcV[i])>>(depth-8); \
-    } \
+    yuv9_OR_10ToUV_c_template(dstU, dstV, srcU, srcV, width, origin, depth); \
  } \
-\
-static void endianness ## depth ## ToY_c(uint8_t *dstY, const uint8_t *_srcY, \
-                                         int width, uint32_t *unused) \
+static void BE_LE ## depth ## ToY_c(uint8_t *dstY, const uint8_t *srcY, \
+                                    int width, uint32_t *unused) \
  { \
-    int i; \
-    const uint16_t *srcY = (const uint16_t*)_srcY; \
-    for (i = 0; i < width; i++) \
-        dstY[i] = rfunc(&srcY[i])>>(depth-8); \
-} \
+    yuv9_or_10ToY_c_template(dstY, srcY, width, origin, depth); \
+}
  
-YUV_NBPS( 9, LE, AV_RL16)
-YUV_NBPS( 9, BE, AV_RB16)
-YUV_NBPS(10, LE, AV_RL16)
-YUV_NBPS(10, BE, AV_RB16)
+YUV_NBPS( 9, LE, PIX_FMT_YUV420P9LE);
+YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
+YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
+YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
  
  static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
                         int width, uint32_t *unused)
@@ -1495,12 +1569,12 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
  }
  
  // *** horizontal scale Y line to temp buffer
-static inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
-                           const uint8_t *src, int srcW, int xInc,
-                           const int16_t *hLumFilter,
-                           const int16_t *hLumFilterPos, int hLumFilterSize,
-                           uint8_t *formatConvBuffer,
-                           uint32_t *pal, int isAlpha)
+static av_always_inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
+                                     const uint8_t *src, int srcW, int xInc,
+                                     const int16_t *hLumFilter,
+                                     const int16_t *hLumFilterPos, int hLumFilterSize,
+                                     uint8_t *formatConvBuffer,
+                                     uint32_t *pal, int isAlpha)
  {
      void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
      void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
@@ -1535,11 +1609,11 @@ static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
      }
  }
  
-static inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
-                           const uint8_t *src1, const uint8_t *src2,
-                           int srcW, int xInc, const int16_t *hChrFilter,
-                           const int16_t *hChrFilterPos, int hChrFilterSize,
-                           uint8_t *formatConvBuffer, uint32_t *pal)
+static av_always_inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
+                                     const uint8_t *src1, const uint8_t *src2,
+                                     int srcW, int xInc, const int16_t *hChrFilter,
+                                     const int16_t *hChrFilterPos, int hChrFilterSize,
+                                     uint8_t *formatConvBuffer, uint32_t *pal)
  {
      if (c->chrToYV12) {
          uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
@@ -1559,6 +1633,51 @@ static inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int ds
          c->chrConvertRange(dst1, dst2, dstWidth);
  }
  
+static av_always_inline void
+find_c_packed_planar_out_funcs(SwsContext *c,
+                               yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
+                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
+                               yuv2packedX_fn *yuv2packedX)
+{
+    enum PixelFormat dstFormat = c->dstFormat;
+
+    if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
+        *yuv2yuvX     = yuv2nv12X_c;
+    } else if (is16BPS(dstFormat)) {
+        *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
+    } else if (is9_OR_10BPS(dstFormat)) {
+        if (dstFormat == PIX_FMT_YUV420P9BE || dstFormat == PIX_FMT_YUV420P9LE) {
+            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
+        } else {
+            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
+        }
+    } else {
+        *yuv2yuv1     = yuv2yuv1_c;
+        *yuv2yuvX     = yuv2yuvX_c;
+    }
+    if(c->flags & SWS_FULL_CHR_H_INT) {
+        *yuv2packedX = yuv2rgbX_c_full;
+    } else {
+        switch (dstFormat) {
+        case PIX_FMT_GRAY16BE:
+            *yuv2packed1 = yuv2gray16BE_1_c;
+            *yuv2packed2 = yuv2gray16BE_2_c;
+            *yuv2packedX = yuv2gray16BE_X_c;
+            break;
+        case PIX_FMT_GRAY16LE:
+            *yuv2packed1 = yuv2gray16LE_1_c;
+            *yuv2packed2 = yuv2gray16LE_2_c;
+            *yuv2packedX = yuv2gray16LE_X_c;
+            break;
+        default:
+            *yuv2packed1 = yuv2packed1_c;
+            *yuv2packed2 = yuv2packed2_c;
+            *yuv2packedX = yuv2packedX_c;
+            break;
+        }
+    }
+}
+
  #define DEBUG_SWSCALE_BUFFERS 0
  #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
  
@@ -1602,6 +1721,11 @@ static int swScale(SwsContext *c, const uint8_t* src[],
      const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
      int lastDstY;
      uint32_t *pal=c->pal_yuv;
+    yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
+    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
+    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
+    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
+    yuv2packedX_fn yuv2packedX = c->yuv2packedX;
  
      /* vars which will change and which we need to store back in the context */
      int dstY= c->dstY;
@@ -1738,19 +1862,19 @@ static int swScale(SwsContext *c, const uint8_t* src[],
  #if HAVE_MMX
          updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
  #endif
-        if (dstY < dstH-2) {
+        if (dstY >= dstH-2) {
+            // hmm looks like we can't use MMX here without overwriting this array's tail
+            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
+                                           &yuv2packed1, &yuv2packed2,
+                                           &yuv2packedX);
+        }
+
+        {
              const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
              const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
              const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
              const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
-                c->yuv2nv12X(c,
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                             vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                             dest, uDest, dstW, chrDstW, dstFormat);
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
+            if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                  const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                  if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                  if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
@@ -1758,10 +1882,10 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                      const int16_t *chrUBuf= chrUSrcPtr[0];
                      const int16_t *chrVBuf= chrVSrcPtr[0];
                      const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
-                    c->yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
+                    yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
                                  uDest, vDest, aDest, dstW, chrDstW);
                  } else { //General YV12
-                    c->yuv2yuvX(c,
+                    yuv2yuvX(c,
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
                                  chrVSrcPtr, vChrFilterSize,
@@ -1772,7 +1896,7 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                  assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                  if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                      int chrAlpha= vChrFilter[2*dstY+1];
-                    c->yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
+                    yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
                                     *chrVSrcPtr, *(chrVSrcPtr+1),
                                     alpPixBuf ? *alpSrcPtr : NULL,
                                     dest, dstW, chrAlpha, dstFormat, flags, dstY);
@@ -1783,61 +1907,17 @@ static int swScale(SwsContext *c, const uint8_t* src[],
                      lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                      chrMmxFilter[2]=
                      chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
-                    c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
+                    yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
                                     *chrVSrcPtr, *(chrVSrcPtr+1),
                                     alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                     dest, dstW, lumAlpha, chrAlpha, dstY);
                  } else { //general RGB
-                    c->yuv2packedX(c,
+                    yuv2packedX(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                  }
              }
-        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
-            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-            const int16_t **chrUSrcPtr= (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **chrVSrcPtr= (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
-                yuv2nv12X_c(c, vLumFilter+dstY*vLumFilterSize,
-                            lumSrcPtr, vLumFilterSize,
-                            vChrFilter+chrDstY*vChrFilterSize,
-                            chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                            dest, uDest, dstW, chrDstW, dstFormat);
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
-                if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
-                    yuv2yuvX16_c(c, vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                 vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                                 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW,
-                                 dstFormat);
-                } else {
-                    yuv2yuvX_c(c, vLumFilter+dstY*vLumFilterSize,
-                               lumSrcPtr, vLumFilterSize,
-                               vChrFilter+chrDstY*vChrFilterSize,
-                               chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                               alpSrcPtr, dest, uDest, vDest, aDest,
-                               dstW, chrDstW);
-                }
-            } else {
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
-                assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
-                if(flags & SWS_FULL_CHR_H_INT) {
-                    yuv2rgbX_c_full(c,
-                                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                    vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                                    alpSrcPtr, dest, dstW, dstY);
-                } else {
-                    yuv2packedX_c(c,
-                                  vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                  vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
-                                  alpSrcPtr, dest, dstW, dstY);
-                }
-            }
          }
      }
  
@@ -1860,31 +1940,13 @@ static int swScale(SwsContext *c, const uint8_t* src[],
      return dstY - lastDstY;
  }
  
-static void sws_init_swScale_c(SwsContext *c)
+static av_cold void sws_init_swScale_c(SwsContext *c)
  {
-    enum PixelFormat srcFormat = c->srcFormat,
-                     dstFormat = c->dstFormat;
+    enum PixelFormat srcFormat = c->srcFormat;
  
-    c->yuv2nv12X    = yuv2nv12X_c;
-    if (is16BPS(dstFormat)) {
-        c->yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
-    } else if (is9_OR_10BPS(dstFormat)) {
-        if (dstFormat == PIX_FMT_YUV420P9BE || dstFormat == PIX_FMT_YUV420P9LE) {
-            c->yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
-        } else {
-            c->yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
-        }
-    } else {
-        c->yuv2yuv1     = yuv2yuv1_c;
-        c->yuv2yuvX     = yuv2yuvX_c;
-    }
-    if(c->flags & SWS_FULL_CHR_H_INT) {
-        c->yuv2packedX = yuv2rgbX_c_full;
-    } else {
-        c->yuv2packed1  = yuv2packed1_c;
-        c->yuv2packed2  = yuv2packed2_c;
-        c->yuv2packedX  = yuv2packedX_c;
-    }
+    find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
+                                   &c->yuv2packed1, &c->yuv2packed2,
+                                   &c->yuv2packedX);
  
      c->hScale       = hScale_c;