libdc1394: add a private option for channel.

[ffmpeg] / libswscale / swscale.c
diff --git a/libswscale/swscale.c b/libswscale/swscale.c

index 565743a22973707c19560409c0b7ebf7ed4a39a3..36b676bf0623f89854b443ae3e637c9c15b837c6 100644 (file)
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -62,6 +62,7 @@ untested special converters
  #include "rgb2rgb.h"
  #include "libavutil/intreadwrite.h"
  #include "libavutil/x86_cpu.h"
+#include "libavutil/cpu.h"
  #include "libavutil/avutil.h"
  #include "libavutil/mathematics.h"
  #include "libavutil/bswap.h"
@@ -70,10 +71,6 @@ untested special converters
  #undef MOVNTQ
  #undef PAVGB
  
-//#undef HAVE_MMX2
-//#define HAVE_AMD3DNOW
-//#undef HAVE_MMX
-//#undef ARCH_X86
  #define DITHER1XBPP
  
  #define isPacked(x)         (       \
@@ -121,63 +118,6 @@ add BGR4 output support
  write special BGR->BGR scaler
  */
  
-#if ARCH_X86
-DECLARE_ASM_CONST(8, uint64_t, bF8)=       0xF8F8F8F8F8F8F8F8LL;
-DECLARE_ASM_CONST(8, uint64_t, bFC)=       0xFCFCFCFCFCFCFCFCLL;
-DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL;
-DECLARE_ASM_CONST(8, uint64_t, w02)=       0x0002000200020002LL;
-DECLARE_ASM_CONST(8, uint64_t, bm00001111)=0x00000000FFFFFFFFLL;
-DECLARE_ASM_CONST(8, uint64_t, bm00000111)=0x0000000000FFFFFFLL;
-DECLARE_ASM_CONST(8, uint64_t, bm11111000)=0xFFFFFFFFFF000000LL;
-DECLARE_ASM_CONST(8, uint64_t, bm01010101)=0x00FF00FF00FF00FFLL;
-
-const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
-        0x0103010301030103LL,
-        0x0200020002000200LL,};
-
-const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
-        0x0602060206020602LL,
-        0x0004000400040004LL,};
-
-DECLARE_ASM_CONST(8, uint64_t, b16Mask)=   0x001F001F001F001FLL;
-DECLARE_ASM_CONST(8, uint64_t, g16Mask)=   0x07E007E007E007E0LL;
-DECLARE_ASM_CONST(8, uint64_t, r16Mask)=   0xF800F800F800F800LL;
-DECLARE_ASM_CONST(8, uint64_t, b15Mask)=   0x001F001F001F001FLL;
-DECLARE_ASM_CONST(8, uint64_t, g15Mask)=   0x03E003E003E003E0LL;
-DECLARE_ASM_CONST(8, uint64_t, r15Mask)=   0x7C007C007C007C00LL;
-
-DECLARE_ALIGNED(8, const uint64_t, ff_M24A)         = 0x00FF0000FF0000FFLL;
-DECLARE_ALIGNED(8, const uint64_t, ff_M24B)         = 0xFF0000FF0000FF00LL;
-DECLARE_ALIGNED(8, const uint64_t, ff_M24C)         = 0x0000FF0000FF0000LL;
-
-#ifdef FAST_BGR2YV12
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff)   = 0x000000210041000DULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff)   = 0x0000FFEEFFDC0038ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff)   = 0x00000038FFD2FFF8ULL;
-#else
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff)   = 0x000020E540830C8BULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff)   = 0x0000ED0FDAC23831ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff)   = 0x00003831D0E6F6EAULL;
-#endif /* FAST_BGR2YV12 */
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
-
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL;
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL;
-
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUV)[2][4] = {
-    {0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL},
-    {0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL},
-};
-
-DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL;
-
-#endif /* ARCH_X86 */
-
  DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  {  1,   3,   1,   3,   1,   3,   1,   3, },
  {  2,   0,   2,   0,   2,   0,   2,   0, },
@@ -269,29 +209,40 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
  static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                                      const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                                      const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest,
-                                                    int dstW, int chrDstW, int big_endian)
+                                                    int dstW, int chrDstW, int big_endian, int output_bits)
  {
      //FIXME Optimize (just quickly written not optimized..)
      int i;
-
+    int shift = 11 + 16 - output_bits;
+
+#define output_pixel(pos, val) \
+    if (big_endian) { \
+        if (output_bits == 16) { \
+            AV_WB16(pos, av_clip_uint16(val >> shift)); \
+        } else { \
+            AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+        } \
+    } else { \
+        if (output_bits == 16) { \
+            AV_WL16(pos, av_clip_uint16(val >> shift)); \
+        } else { \
+            AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+        } \
+    }
      for (i = 0; i < dstW; i++) {
-        int val = 1 << 10;
+        int val = 1 << (26-output_bits);
          int j;
  
          for (j = 0; j < lumFilterSize; j++)
              val += lumSrc[j][i] * lumFilter[j];
  
-        if (big_endian) {
-            AV_WB16(&dest[i], av_clip_uint16(val >> 11));
-        } else {
-            AV_WL16(&dest[i], av_clip_uint16(val >> 11));
-        }
+        output_pixel(&dest[i], val);
      }
  
      if (uDest) {
          for (i = 0; i < chrDstW; i++) {
-            int u = 1 << 10;
-            int v = 1 << 10;
+            int u = 1 << (26-output_bits);
+            int v = 1 << (26-output_bits);
              int j;
  
              for (j = 0; j < chrFilterSize; j++) {
@@ -299,51 +250,72 @@ static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, co
                  v += chrSrc[j][i + VOFW] * chrFilter[j];
              }
  
-            if (big_endian) {
-                AV_WB16(&uDest[i], av_clip_uint16(u >> 11));
-                AV_WB16(&vDest[i], av_clip_uint16(v >> 11));
-            } else {
-                AV_WL16(&uDest[i], av_clip_uint16(u >> 11));
-                AV_WL16(&vDest[i], av_clip_uint16(v >> 11));
-            }
+            output_pixel(&uDest[i], u);
+            output_pixel(&vDest[i], v);
          }
      }
  
      if (CONFIG_SWSCALE_ALPHA && aDest) {
          for (i = 0; i < dstW; i++) {
-            int val = 1 << 10;
+            int val = 1 << (26-output_bits);
              int j;
  
              for (j = 0; j < lumFilterSize; j++)
                  val += alpSrc[j][i] * lumFilter[j];
  
-            if (big_endian) {
-                AV_WB16(&aDest[i], av_clip_uint16(val >> 11));
-            } else {
-                AV_WL16(&aDest[i], av_clip_uint16(val >> 11));
-            }
+            output_pixel(&aDest[i], val);
          }
      }
  }
  
+#define yuv2NBPS(bits, BE_LE, is_be) \
+static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *lumFilter, \
+                              const int16_t **lumSrc, int lumFilterSize, \
+                              const int16_t *chrFilter, const int16_t **chrSrc, \
+                              int chrFilterSize, const int16_t **alpSrc, \
+                              uint16_t *dest, uint16_t *uDest, uint16_t *vDest, \
+                              uint16_t *aDest, int dstW, int chrDstW) \
+{ \
+    yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize, \
+                           chrFilter, chrSrc, chrFilterSize, \
+                           alpSrc, \
+                           dest, uDest, vDest, aDest, \
+                           dstW, chrDstW, is_be, bits); \
+}
+yuv2NBPS( 9, BE, 1);
+yuv2NBPS( 9, LE, 0);
+yuv2NBPS(10, BE, 1);
+yuv2NBPS(10, LE, 0);
+yuv2NBPS(16, BE, 1);
+yuv2NBPS(16, LE, 0);
+
  static inline void yuv2yuvX16inC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                   const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, int dstW, int chrDstW,
                                   enum PixelFormat dstFormat)
  {
-    if (isBE(dstFormat)) {
-        yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
-                               chrFilter, chrSrc, chrFilterSize,
-                               alpSrc,
-                               dest, uDest, vDest, aDest,
-                               dstW, chrDstW, 1);
+#define conv16(bits) \
+    if (isBE(dstFormat)) { \
+        yuv2yuvX ## bits ## BE_c(lumFilter, lumSrc, lumFilterSize, \
+                               chrFilter, chrSrc, chrFilterSize, \
+                               alpSrc, \
+                               dest, uDest, vDest, aDest, \
+                               dstW, chrDstW); \
+    } else { \
+        yuv2yuvX ## bits ## LE_c(lumFilter, lumSrc, lumFilterSize, \
+                               chrFilter, chrSrc, chrFilterSize, \
+                               alpSrc, \
+                               dest, uDest, vDest, aDest, \
+                               dstW, chrDstW); \
+    }
+    if (is16BPS(dstFormat)) {
+        conv16(16);
+    } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
+        conv16(9);
      } else {
-        yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
-                               chrFilter, chrSrc, chrFilterSize,
-                               alpSrc,
-                               dest, uDest, vDest, aDest,
-                               dstW, chrDstW, 0);
+        conv16(10);
      }
+#undef conv16
  }
  
  static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
@@ -696,6 +668,18 @@ static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc
              dest+=12;\
          }\
          break;\
+    case PIX_FMT_BGR48BE:\
+    case PIX_FMT_BGR48LE:\
+        func(uint8_t,0)\
+            ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
+            ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
+            ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
+            ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
+            ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
+            ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
+            dest+=12;\
+        }\
+        break;\
      case PIX_FMT_RGBA:\
      case PIX_FMT_BGRA:\
          if (CONFIG_SMALL) {\
@@ -1033,6 +1017,49 @@ static inline void rgb48ToUV_half(uint8_t *dstU, uint8_t *dstV,
      }
  }
  
+static inline void bgr48ToY(uint8_t *dst, const uint8_t *src, long width,
+                            uint32_t *unused)
+{
+    int i;
+    for (i = 0; i < width; i++) {
+        int b = src[i*6+0];
+        int g = src[i*6+2];
+        int r = src[i*6+4];
+
+        dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static inline void bgr48ToUV(uint8_t *dstU, uint8_t *dstV,
+                             const uint8_t *src1, const uint8_t *src2,
+                             long width, uint32_t *unused)
+{
+    int i;
+    for (i = 0; i < width; i++) {
+        int b = src1[6*i + 0];
+        int g = src1[6*i + 2];
+        int r = src1[6*i + 4];
+
+        dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+static inline void bgr48ToUV_half(uint8_t *dstU, uint8_t *dstV,
+                                  const uint8_t *src1, const uint8_t *src2,
+                                  long width, uint32_t *unused)
+{
+    int i;
+    for (i = 0; i < width; i++) {
+        int b= src1[12*i + 0] + src1[12*i + 6];
+        int g= src1[12*i + 2] + src1[12*i + 8];
+        int r= src1[12*i + 4] + src1[12*i + 10];
+
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
+    }
+}
+
  #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
  static inline void name(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)\
  {\
@@ -1149,162 +1176,58 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
  
  //Note: we have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW+MMX2 one
  //Plain C versions
-#if CONFIG_RUNTIME_CPUDETECT
-#  define COMPILE_C 1
-#  if   ARCH_X86
-#    define COMPILE_MMX     HAVE_MMX
-#    define COMPILE_MMX2    HAVE_MMX2
-#    define COMPILE_3DNOW   HAVE_AMD3DNOW
-#  elif ARCH_PPC
-#    define COMPILE_ALTIVEC HAVE_ALTIVEC
-#  endif
-#else /* CONFIG_RUNTIME_CPUDETECT */
-#  if   ARCH_X86
-#    if   HAVE_MMX2
-#      define COMPILE_MMX2  1
-#    elif HAVE_AMD3DNOW
-#      define COMPILE_3DNOW 1
-#    elif HAVE_MMX
-#      define COMPILE_MMX   1
-#    else
-#      define COMPILE_C     1
-#    endif
-#  elif ARCH_PPC && HAVE_ALTIVEC
-#    define COMPILE_ALTIVEC 1
-#  else
-#    define COMPILE_C       1
-#  endif
-#endif
-
-#ifndef COMPILE_C
-#  define COMPILE_C 0
-#endif
-#ifndef COMPILE_MMX
-#  define COMPILE_MMX 0
-#endif
-#ifndef COMPILE_MMX2
-#  define COMPILE_MMX2 0
-#endif
-#ifndef COMPILE_3DNOW
-#  define COMPILE_3DNOW 0
-#endif
-#ifndef COMPILE_ALTIVEC
-#  define COMPILE_ALTIVEC 0
-#endif
  
-#define COMPILE_TEMPLATE_MMX 0
  #define COMPILE_TEMPLATE_MMX2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 0
  #define COMPILE_TEMPLATE_ALTIVEC 0
  
-#if COMPILE_C
-#define RENAME(a) a ## _C
  #include "swscale_template.c"
-#endif
  
-#if COMPILE_ALTIVEC
+#if HAVE_ALTIVEC
  #undef RENAME
  #undef COMPILE_TEMPLATE_ALTIVEC
  #define COMPILE_TEMPLATE_ALTIVEC 1
  #define RENAME(a) a ## _altivec
-#include "swscale_template.c"
+#include "ppc/swscale_template.c"
  #endif
  
-#if ARCH_X86
-
  //MMX versions
-#if COMPILE_MMX
+#if HAVE_MMX
  #undef RENAME
-#undef COMPILE_TEMPLATE_MMX
  #undef COMPILE_TEMPLATE_MMX2
-#undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX 1
  #define COMPILE_TEMPLATE_MMX2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 0
  #define RENAME(a) a ## _MMX
-#include "swscale_template.c"
+#include "x86/swscale_template.c"
  #endif
  
  //MMX2 versions
-#if COMPILE_MMX2
+#if HAVE_MMX2
  #undef RENAME
-#undef COMPILE_TEMPLATE_MMX
  #undef COMPILE_TEMPLATE_MMX2
-#undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX 1
  #define COMPILE_TEMPLATE_MMX2 1
-#define COMPILE_TEMPLATE_AMD3DNOW 0
  #define RENAME(a) a ## _MMX2
-#include "swscale_template.c"
+#include "x86/swscale_template.c"
  #endif
  
-//3DNOW versions
-#if COMPILE_3DNOW
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMX
-#undef COMPILE_TEMPLATE_MMX2
-#undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX 1
-#define COMPILE_TEMPLATE_MMX2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 1
-#define RENAME(a) a ## _3DNow
-#include "swscale_template.c"
-#endif
-
-#endif //ARCH_X86
-
  SwsFunc ff_getSwsFunc(SwsContext *c)
  {
-#if CONFIG_RUNTIME_CPUDETECT
-    int flags = c->flags;
+    int cpu_flags = av_get_cpu_flags();
  
-#if ARCH_X86
-    // ordered per speed fastest first
-    if (flags & SWS_CPU_CAPS_MMX2) {
-        sws_init_swScale_MMX2(c);
-        return swScale_MMX2;
-    } else if (flags & SWS_CPU_CAPS_3DNOW) {
-        sws_init_swScale_3DNow(c);
-        return swScale_3DNow;
-    } else if (flags & SWS_CPU_CAPS_MMX) {
-        sws_init_swScale_MMX(c);
-        return swScale_MMX;
-    } else {
-        sws_init_swScale_C(c);
-        return swScale_C;
-    }
+    sws_init_swScale_c(c);
  
-#else
-#if COMPILE_ALTIVEC
-    if (flags & SWS_CPU_CAPS_ALTIVEC) {
-        sws_init_swScale_altivec(c);
-        return swScale_altivec;
-    } else {
-        sws_init_swScale_C(c);
-        return swScale_C;
-    }
+#if HAVE_MMX
+    if (cpu_flags & AV_CPU_FLAG_MMX)
+        sws_init_swScale_MMX(c);
  #endif
-    sws_init_swScale_C(c);
-    return swScale_C;
-#endif /* ARCH_X86 */
-#else //CONFIG_RUNTIME_CPUDETECT
-#if   COMPILE_TEMPLATE_MMX2
-    sws_init_swScale_MMX2(c);
-    return swScale_MMX2;
-#elif COMPILE_TEMPLATE_AMD3DNOW
-    sws_init_swScale_3DNow(c);
-    return swScale_3DNow;
-#elif COMPILE_TEMPLATE_MMX
-    sws_init_swScale_MMX(c);
-    return swScale_MMX;
-#elif COMPILE_TEMPLATE_ALTIVEC
-    sws_init_swScale_altivec(c);
-    return swScale_altivec;
-#else
-    sws_init_swScale_C(c);
-    return swScale_C;
+#if HAVE_MMX2
+    if (cpu_flags & AV_CPU_FLAG_MMX2)
+        sws_init_swScale_MMX2(c);
  #endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
+#if HAVE_ALTIVEC
+    if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
+        sws_init_swScale_altivec(c);
+#endif
+
+    return swScale_c;
  }
  
  static void copyPlane(const uint8_t *src, int srcStride,
@@ -1683,7 +1606,127 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[
                  length*=2;
              fillPlane(dst[plane], dstStride[plane], length, height, y, (plane==3) ? 255 : 128);
          } else {
-            if(is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
+            if(is9_OR_10BPS(c->srcFormat)) {
+                const int src_depth = av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1+1;
+                const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
+                const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
+
+                if (is16BPS(c->dstFormat)) {
+                    uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+#define COPY9_OR_10TO16(rfunc, wfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            int srcpx = rfunc(&srcPtr2[j]); \
+                            wfunc(&dstPtr2[j], (srcpx<<(16-src_depth)) | (srcpx>>(2*src_depth-16))); \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO16(AV_RB16, AV_WB16);
+                        } else {
+                            COPY9_OR_10TO16(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO16(AV_RB16, AV_WL16);
+                        } else {
+                            COPY9_OR_10TO16(AV_RL16, AV_WL16);
+                        }
+                    }
+                } else if (is9_OR_10BPS(c->dstFormat)) {
+                    uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+#define COPY9_OR_10TO9_OR_10(loop) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            loop; \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+#define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) \
+                    if (dst_depth > src_depth) { \
+                        COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \
+                            wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 9))); \
+                    } else if (dst_depth < src_depth) { \
+                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]) >> 1)); \
+                    } else { \
+                        COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16);
+                        } else {
+                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16);
+                        } else {
+                            COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16);
+                        }
+                    }
+                } else {
+                    // FIXME Maybe dither instead.
+#define COPY9_OR_10TO8(rfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            dstPtr[j] = rfunc(&srcPtr2[j])>>(src_depth-8); \
+                        } \
+                        dstPtr  += dstStride[plane]; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->srcFormat)) {
+                        COPY9_OR_10TO8(AV_RB16);
+                    } else {
+                        COPY9_OR_10TO8(AV_RL16);
+                    }
+                }
+            } else if(is9_OR_10BPS(c->dstFormat)) {
+                const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
+                uint16_t *dstPtr2 = (uint16_t*)dstPtr;
+
+                if (is16BPS(c->srcFormat)) {
+                    const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
+#define COPY16TO9_OR_10(rfunc, wfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            wfunc(&dstPtr2[j], rfunc(&srcPtr2[j])>>(16-dst_depth)); \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr2 += srcStride[plane]/2; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        if (isBE(c->srcFormat)) {
+                            COPY16TO9_OR_10(AV_RB16, AV_WB16);
+                        } else {
+                            COPY16TO9_OR_10(AV_RL16, AV_WB16);
+                        }
+                    } else {
+                        if (isBE(c->srcFormat)) {
+                            COPY16TO9_OR_10(AV_RB16, AV_WL16);
+                        } else {
+                            COPY16TO9_OR_10(AV_RL16, AV_WL16);
+                        }
+                    }
+                } else /* 8bit */ {
+#define COPY8TO9_OR_10(wfunc) \
+                    for (i = 0; i < height; i++) { \
+                        for (j = 0; j < length; j++) { \
+                            const int srcpx = srcPtr[j]; \
+                            wfunc(&dstPtr2[j], (srcpx<<(dst_depth-8)) | (srcpx >> (16-dst_depth))); \
+                        } \
+                        dstPtr2 += dstStride[plane]/2; \
+                        srcPtr  += srcStride[plane]; \
+                    }
+                    if (isBE(c->dstFormat)) {
+                        COPY8TO9_OR_10(AV_WB16);
+                    } else {
+                        COPY8TO9_OR_10(AV_WL16);
+                    }
+                }
+            } else if(is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
                  if (!isBE(c->srcFormat)) srcPtr++;
                  for (i=0; i<height; i++) {
                      for (j=0; j<length; j++) dstPtr[j] = srcPtr[j<<1];
@@ -1726,23 +1769,6 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[
      return srcSliceH;
  }
  
-int ff_hardcodedcpuflags(void)
-{
-    int flags = 0;
-#if   COMPILE_TEMPLATE_MMX2
-    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
-#elif COMPILE_TEMPLATE_AMD3DNOW
-    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
-#elif COMPILE_TEMPLATE_MMX
-    flags |= SWS_CPU_CAPS_MMX;
-#elif COMPILE_TEMPLATE_ALTIVEC
-    flags |= SWS_CPU_CAPS_ALTIVEC;
-#elif ARCH_BFIN
-    flags |= SWS_CPU_CAPS_BFIN;
-#endif
-    return flags;
-}
-
  void ff_get_unscaled_swscale(SwsContext *c)
  {
      const enum PixelFormat srcFormat = c->srcFormat;
@@ -1786,6 +1812,8 @@ void ff_get_unscaled_swscale(SwsContext *c)
          && srcFormat != PIX_FMT_MONOWHITE && dstFormat != PIX_FMT_MONOWHITE
          && srcFormat != PIX_FMT_RGB48LE   && dstFormat != PIX_FMT_RGB48LE
          && srcFormat != PIX_FMT_RGB48BE   && dstFormat != PIX_FMT_RGB48BE
+        && srcFormat != PIX_FMT_BGR48LE   && dstFormat != PIX_FMT_BGR48LE
+        && srcFormat != PIX_FMT_BGR48BE   && dstFormat != PIX_FMT_BGR48BE
          && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
          c->swScale= rgbToRgbWrapper;
  
@@ -1824,8 +1852,8 @@ void ff_get_unscaled_swscale(SwsContext *c)
      if(srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
          c->swScale= uyvyToYuv422Wrapper;
  
-#if COMPILE_ALTIVEC
-    if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
+#if HAVE_ALTIVEC
+    if ((av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) &&
          !(c->flags & SWS_BITEXACT) &&
          srcFormat == PIX_FMT_YUV420P) {
          // unscaled YV12 -> packed YUV, we want speed
@@ -1855,8 +1883,7 @@ void ff_get_unscaled_swscale(SwsContext *c)
              c->swScale= planarCopyWrapper;
      }
  #if ARCH_BFIN
-    if (flags & SWS_CPU_CAPS_BFIN)
-        ff_bfin_get_unscaled_swscale (c);
+    ff_bfin_get_unscaled_swscale (c);
  #endif
  }
  
@@ -2021,14 +2048,6 @@ int sws_scale(SwsContext *c, const uint8_t* const src[], const int srcStride[],
      }
  }
  
-#if LIBSWSCALE_VERSION_MAJOR < 1
-int sws_scale_ordered(SwsContext *c, const uint8_t* const src[], int srcStride[], int srcSliceY,
-                      int srcSliceH, uint8_t* dst[], int dstStride[])
-{
-    return sws_scale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
-}
-#endif
-
  /* Convert the palette to the same packed 32-bit format as the palette */
  void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
  {