git.sesse.net Git - ffmpeg/blobdiff - postproc/swscale.c
100l, forgot to change an ifdef on last commit
index 98cd469370256e92cfd730fdbcdd21f38e61cb39..71e6ff515fe1843f2170a3e4eee87139d0c060fc 100644
@@ -53,6 +53,7 @@ untested special converters
 #include <string.h>
 #include <math.h>
 #include <stdio.h>
+#include <unistd.h>
 #include "../config.h"
 #include "../mangle.h"
 #include <assert.h>
@@ -61,6 +62,12 @@ untested special converters
 #else
 #include <stdlib.h>
 #endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#endif
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "../cpudetect.h"
@@ -97,6 +104,7 @@ untested special converters
 
 //FIXME replace this with something faster
 #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YVU9 \
+                       || (x)==IMGFMT_NV12 || (x)==IMGFMT_NV21 \
                        || (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P)
 #define isYUV(x)       ((x)==IMGFMT_UYVY || (x)==IMGFMT_YUY2 || isPlanarYUV(x))
 #define isGray(x)      ((x)==IMGFMT_Y800)
@@ -110,6 +118,7 @@ untested special converters
 #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY\
                        || (x)==IMGFMT_444P || (x)==IMGFMT_422P || (x)==IMGFMT_411P\
                        || isRGB(x) || isBGR(x)\
+                       || (x)==IMGFMT_NV12 || (x)==IMGFMT_NV21\
                        || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
 #define isPacked(x)    ((x)==IMGFMT_YUY2 || (x)==IMGFMT_UYVY ||isRGB(x) || isBGR(x))
 
@@ -145,20 +154,20 @@ write special BGR->BGR scaler
 #define MIN(a,b) ((a) > (b) ? (b) : (a))
 #define MAX(a,b) ((a) < (b) ? (b) : (a))
 
-#ifdef ARCH_X86
-static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
-static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static uint64_t attribute_used __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
+static uint64_t attribute_used __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
-static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
-static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
-static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
-static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
-static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
+static uint64_t attribute_used __attribute__((aligned(8))) w02=       0x0002000200020002LL;
+static uint64_t attribute_used __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
+static uint64_t attribute_used __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
+static uint64_t attribute_used __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
+static uint64_t attribute_used __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
 
-static volatile uint64_t __attribute__((aligned(8))) b5Dither;
-static volatile uint64_t __attribute__((aligned(8))) g5Dither;
-static volatile uint64_t __attribute__((aligned(8))) g6Dither;
-static volatile uint64_t __attribute__((aligned(8))) r5Dither;
+static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
+static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
+static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
+static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
 
 static uint64_t __attribute__((aligned(8))) dither4[2]={
        0x0103010301030103LL,
@@ -169,28 +178,28 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
        0x0004000400040004LL,};
 
 static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
-static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
-static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
+static uint64_t attribute_used __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
+static uint64_t attribute_used __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
 static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
-static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
-static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
+static uint64_t attribute_used __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
+static uint64_t attribute_used __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
 
-static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
-static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
-static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
+static uint64_t attribute_used __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
+static uint64_t attribute_used __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
+static uint64_t attribute_used __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
 
 #ifdef FAST_BGR2YV12
-static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
-static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
-static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
+static const uint64_t bgr2YCoeff  attribute_used __attribute__((aligned(8))) = 0x000000210041000DULL;
+static const uint64_t bgr2UCoeff  attribute_used __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
+static const uint64_t bgr2VCoeff  attribute_used __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
 #else
-static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
-static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
-static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
+static const uint64_t bgr2YCoeff  attribute_used __attribute__((aligned(8))) = 0x000020E540830C8BULL;
+static const uint64_t bgr2UCoeff  attribute_used __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
+static const uint64_t bgr2VCoeff  attribute_used __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
 #endif
-static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
-static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
-static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
+static const uint64_t bgr2YOffset attribute_used __attribute__((aligned(8))) = 0x1010101010101010ULL;
+static const uint64_t bgr2UVOffset attribute_used __attribute__((aligned(8)))= 0x8080808080808080ULL;
+static const uint64_t w1111       attribute_used __attribute__((aligned(8))) = 0x0001000100010001ULL;
 #endif
 
 // clipping helper table for C implementations:
@@ -204,7 +213,7 @@ extern const uint8_t dither_8x8_32[8][8];
 extern const uint8_t dither_8x8_73[8][8];
 extern const uint8_t dither_8x8_220[8][8];
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 void in_asm_used_var_warning_killer()
 {
  volatile int i= bF8+bFC+w10+
@@ -247,6 +256,56 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
                }
 }
 
+static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+                               int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+                               uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
+{
+       //FIXME Optimize (just quickly writen not opti..)
+       int i;
+       for(i=0; i<dstW; i++)
+       {
+               int val=1<<18;
+               int j;
+               for(j=0; j<lumFilterSize; j++)
+                       val += lumSrc[j][i] * lumFilter[j];
+
+               dest[i]= MIN(MAX(val>>19, 0), 255);
+       }
+
+       if(uDest == NULL)
+               return;
+
+       if(dstFormat == IMGFMT_NV12)
+               for(i=0; i<chrDstW; i++)
+               {
+                       int u=1<<18;
+                       int v=1<<18;
+                       int j;
+                       for(j=0; j<chrFilterSize; j++)
+                       {
+                               u += chrSrc[j][i] * chrFilter[j];
+                               v += chrSrc[j][i + 2048] * chrFilter[j];
+                       }
+
+                       uDest[2*i]= MIN(MAX(u>>19, 0), 255);
+                       uDest[2*i+1]= MIN(MAX(v>>19, 0), 255);
+               }
+       else
+               for(i=0; i<chrDstW; i++)
+               {
+                       int u=1<<18;
+                       int v=1<<18;
+                       int j;
+                       for(j=0; j<chrFilterSize; j++)
+                       {
+                               u += chrSrc[j][i] * chrFilter[j];
+                               v += chrSrc[j][i + 2048] * chrFilter[j];
+                       }
+
+                       uDest[2*i]= MIN(MAX(v>>19, 0), 255);
+                       uDest[2*i+1]= MIN(MAX(u>>19, 0), 255);
+               }
+}
 
 #define YSCALE_YUV_2_PACKEDX_C(type) \
                for(i=0; i<(dstW>>1); i++){\
@@ -353,7 +412,7 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
                        ((uint8_t*)dest)[3]= r[Y2];\
                        ((uint8_t*)dest)[4]= g[Y2];\
                        ((uint8_t*)dest)[5]= b[Y2];\
-                       ((uint8_t*)dest)+=6;\
+                       dest+=6;\
                }\
                break;\
        case IMGFMT_BGR24:\
@@ -364,7 +423,7 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
                        ((uint8_t*)dest)[3]= b[Y2];\
                        ((uint8_t*)dest)[4]= g[Y2];\
                        ((uint8_t*)dest)[5]= r[Y2];\
-                       ((uint8_t*)dest)+=6;\
+                       dest+=6;\
                }\
                break;\
        case IMGFMT_RGB16:\
@@ -446,7 +505,7 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
                                acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
                                acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
                                ((uint8_t*)dest)[0]= acc;\
-                               ((uint8_t*)dest)++;\
+                               dest++;\
                        }\
 \
 /*\
@@ -536,7 +595,7 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
                        ((uint8_t*)dest)[3]= r[Y2];
                        ((uint8_t*)dest)[4]= g[Y2];
                        ((uint8_t*)dest)[5]= b[Y2];
-                       ((uint8_t*)dest)+=6;
+                       dest+=6;
                }
                break;
        case IMGFMT_BGR24:
@@ -547,7 +606,7 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
                        ((uint8_t*)dest)[3]= b[Y2];
                        ((uint8_t*)dest)[4]= g[Y2];
                        ((uint8_t*)dest)[5]= r[Y2];
-                       ((uint8_t*)dest)+=6;
+                       dest+=6;
                }
                break;
        case IMGFMT_RGB16:
@@ -642,7 +701,7 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
                                acc+= acc + g[Y2+d128[(i+1)&7]];
                                if((i&7)==6){
                                        ((uint8_t*)dest)[0]= acc;
-                                       ((uint8_t*)dest)++;
+                                       dest++;
                                }
                        }
                }
@@ -679,7 +738,7 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 #endif //HAVE_ALTIVEC
 #endif //ARCH_POWERPC
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_MMX
@@ -692,7 +751,7 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 #define COMPILE_3DNOW
 #endif
-#endif //ARCH_X86
+#endif //ARCH_X86 || ARCH_X86_64
 
 #undef HAVE_MMX
 #undef HAVE_MMX2
@@ -716,7 +775,7 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 #endif
 #endif //ARCH_POWERPC
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 
 //X86 versions
 /*
@@ -758,7 +817,7 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 #include "swscale_template.c"
 #endif
 
-#endif //ARCH_X86
+#endif //ARCH_X86 || ARCH_X86_64
 
 // minor note: the HAVE_xyz is messed up after that line so don't use it
 
@@ -775,7 +834,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist
 
 static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
                              int srcW, int dstW, int filterAlign, int one, int flags,
-                             SwsVector *srcFilter, SwsVector *dstFilter)
+                             SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
 {
        int i;
        int filterSize;
@@ -783,7 +842,7 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
        int minFilterSize;
        double *filter=NULL;
        double *filter2=NULL;
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
        if(flags & SWS_CPU_CAPS_MMX)
                asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 #endif
@@ -855,13 +914,12 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
                double xDstInSrc;
                double sizeFactor, filterSizeInSrc;
                const double xInc1= (double)xInc / (double)(1<<16);
-               int param= (flags&SWS_PARAM_MASK)>>SWS_PARAM_SHIFT;
 
                if     (flags&SWS_BICUBIC)      sizeFactor= 4.0;
                else if(flags&SWS_X)            sizeFactor= 8.0;
                else if(flags&SWS_AREA)         sizeFactor= 1.0; //downscale only, for upscale it is bilinear
                else if(flags&SWS_GAUSS)        sizeFactor= 8.0;   // infinite ;)
-               else if(flags&SWS_LANCZOS)      sizeFactor= param ? 2.0*param : 6.0;
+               else if(flags&SWS_LANCZOS)      sizeFactor= param[0] != SWS_PARAM_DEFAULT ? 2.0*param[0] : 6.0;
                else if(flags&SWS_SINC)         sizeFactor= 20.0; // infinite ;)
                else if(flags&SWS_SPLINE)       sizeFactor= 20.0;  // infinite ;)
                else if(flags&SWS_BILINEAR)     sizeFactor= 2.0;
@@ -890,13 +948,13 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
                                double coeff;
                                if(flags & SWS_BICUBIC)
                                {
-                                       double A= param ? -param*0.01 : -0.60;
-                                       
-                                       // Equation is from VirtualDub
-                                       if(d<1.0)
-                                               coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
+                                       double B= param[0] != SWS_PARAM_DEFAULT ? param[0] : 0.0;
+                                       double C= param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6;
+
+                                       if(d<1.0) 
+                                               coeff = (12-9*B-6*C)*d*d*d + (-18+12*B+6*C)*d*d + 6-2*B;
                                        else if(d<2.0)
-                                               coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
+                                               coeff = (-B-6*C)*d*d*d + (6*B+30*C)*d*d + (-12*B-48*C)*d +8*B+24*C;
                                        else
                                                coeff=0.0;
                                }
@@ -908,7 +966,7 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
                                }*/
                                else if(flags & SWS_X)
                                {
-                                       double A= param ? param*0.1 : 1.0;
+                                       double A= param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
                                        
                                        if(d<1.0)
                                                coeff = cos(d*PI);
@@ -927,7 +985,7 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
                                }
                                else if(flags & SWS_GAUSS)
                                {
-                                       double p= param ? param*0.1 : 3.0;
+                                       double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
                                        coeff = pow(2.0, - p*d*d);
                                }
                                else if(flags & SWS_SINC)
@@ -936,7 +994,7 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
                                }
                                else if(flags & SWS_LANCZOS)
                                {
-                                       double p= param ? param : 3.0; 
+                                       double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0; 
                                        coeff = d ? sin(d*PI)*sin(d*PI/p)/(d*d*PI*PI/p) : 1.0;
                                        if(d>p) coeff=0;
                                }
@@ -1038,6 +1096,21 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
                if(min>minFilterSize) minFilterSize= min;
        }
 
+        if (flags & SWS_CPU_CAPS_ALTIVEC) {
+          // we can handle the special case 4,
+          // so we don't want to go to the full 8
+          if (minFilterSize < 5)
+            filterAlign = 4;
+
+          // we really don't want to waste our time
+          // doing useless computation, so fall-back on
+          // the scalar C code for very small filter.
+          // vectorizing is worth it only if you have
+          // decent-sized vector.
+          if (minFilterSize < 3)
+            filterAlign = 1;
+        }
+
        ASSERT(minFilterSize > 0)
        filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
        ASSERT(filterSize > 0)
@@ -1128,17 +1201,17 @@ static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *out
        free(filter);
 }
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
 {
        uint8_t *fragmentA;
-       int imm8OfPShufW1A;
-       int imm8OfPShufW2A;
-       int fragmentLengthA;
+       long imm8OfPShufW1A;
+       long imm8OfPShufW2A;
+       long fragmentLengthA;
        uint8_t *fragmentB;
-       int imm8OfPShufW1B;
-       int imm8OfPShufW2B;
-       int fragmentLengthB;
+       long imm8OfPShufW1B;
+       long imm8OfPShufW2B;
+       long fragmentLengthB;
        int fragmentPos;
 
        int xpos, i;
@@ -1151,9 +1224,9 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *fil
                "jmp 9f                         \n\t"
        // Begin
                "0:                             \n\t"
-               "movq (%%edx, %%eax), %%mm3     \n\t" 
-               "movd (%%ecx, %%esi), %%mm0     \n\t" 
-               "movd 1(%%ecx, %%esi), %%mm1    \n\t"
+               "movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" 
+               "movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" 
+               "movd 1(%%"REG_c", %%"REG_S"), %%mm1\n\t"
                "punpcklbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "pshufw $0xFF, %%mm1, %%mm1     \n\t"
@@ -1161,26 +1234,26 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *fil
                "pshufw $0xFF, %%mm0, %%mm0     \n\t"
                "2:                             \n\t"
                "psubw %%mm1, %%mm0             \n\t"
-               "movl 8(%%ebx, %%eax), %%esi    \n\t"
+               "movl 8(%%"REG_b", %%"REG_a"), %%esi\n\t"
                "pmullw %%mm3, %%mm0            \n\t"
                "psllw $7, %%mm1                \n\t"
                "paddw %%mm1, %%mm0             \n\t"
 
-               "movq %%mm0, (%%edi, %%eax)     \n\t"
+               "movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
 
-               "addl $8, %%eax                 \n\t"
+               "add $8, %%"REG_a"              \n\t"
        // End
                "9:                             \n\t"
 //             "int $3\n\t"
-               "leal 0b, %0                    \n\t"
-               "leal 1b, %1                    \n\t"
-               "leal 2b, %2                    \n\t"
-               "decl %1                        \n\t"
-               "decl %2                        \n\t"
-               "subl %0, %1                    \n\t"
-               "subl %0, %2                    \n\t"
-               "leal 9b, %3                    \n\t"
-               "subl %0, %3                    \n\t"
+               "lea 0b, %0                     \n\t"
+               "lea 1b, %1                     \n\t"
+               "lea 2b, %2                     \n\t"
+               "dec %1                         \n\t"
+               "dec %2                         \n\t"
+               "sub %0, %1                     \n\t"
+               "sub %0, %2                     \n\t"
+               "lea 9b, %3                     \n\t"
+               "sub %0, %3                     \n\t"
 
 
                :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
@@ -1191,34 +1264,34 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *fil
                "jmp 9f                         \n\t"
        // Begin
                "0:                             \n\t"
-               "movq (%%edx, %%eax), %%mm3     \n\t" 
-               "movd (%%ecx, %%esi), %%mm0     \n\t" 
+               "movq (%%"REG_d", %%"REG_a"), %%mm3\n\t" 
+               "movd (%%"REG_c", %%"REG_S"), %%mm0\n\t" 
                "punpcklbw %%mm7, %%mm0         \n\t"
                "pshufw $0xFF, %%mm0, %%mm1     \n\t"
                "1:                             \n\t"
                "pshufw $0xFF, %%mm0, %%mm0     \n\t"
                "2:                             \n\t"
                "psubw %%mm1, %%mm0             \n\t"
-               "movl 8(%%ebx, %%eax), %%esi    \n\t"
+               "movl 8(%%"REG_b", %%"REG_a"), %%esi\n\t"
                "pmullw %%mm3, %%mm0            \n\t"
                "psllw $7, %%mm1                \n\t"
                "paddw %%mm1, %%mm0             \n\t"
 
-               "movq %%mm0, (%%edi, %%eax)     \n\t"
+               "movq %%mm0, (%%"REG_D", %%"REG_a")\n\t"
 
-               "addl $8, %%eax                 \n\t"
+               "add $8, %%"REG_a"              \n\t"
        // End
                "9:                             \n\t"
 //             "int $3\n\t"
-               "leal 0b, %0                    \n\t"
-               "leal 1b, %1                    \n\t"
-               "leal 2b, %2                    \n\t"
-               "decl %1                        \n\t"
-               "decl %2                        \n\t"
-               "subl %0, %1                    \n\t"
-               "subl %0, %2                    \n\t"
-               "leal 9b, %3                    \n\t"
-               "subl %0, %3                    \n\t"
+               "lea 0b, %0                     \n\t"
+               "lea 1b, %1                     \n\t"
+               "lea 2b, %2                     \n\t"
+               "dec %1                         \n\t"
+               "dec %2                         \n\t"
+               "sub %0, %1                     \n\t"
+               "sub %0, %2                     \n\t"
+               "lea 9b, %3                     \n\t"
+               "sub %0, %3                     \n\t"
 
 
                :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
@@ -1299,7 +1372,7 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *fil
        }
        filterPos[i/2]= xpos>>16; // needed to jump to the next part
 }
-#endif // ARCH_X86
+#endif // ARCH_X86 || ARCH_X86_64
 
 static void globalInit(){
     // generating tables:
@@ -1313,7 +1386,7 @@ static void globalInit(){
 static SwsFunc getSwsFunc(int flags){
     
 #ifdef RUNTIME_CPUDETECT
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
        // ordered per speed fasterst first
        if(flags & SWS_CPU_CAPS_MMX2)
                return swScale_MMX2;
@@ -1361,13 +1434,16 @@ static int PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], i
                uint8_t *dstPtr= dst;
                for(i=0; i<srcSliceH; i++)
                {
-                       memcpy(dstPtr, srcPtr, srcStride[0]);
+                       memcpy(dstPtr, srcPtr, c->srcW);
                        srcPtr+= srcStride[0];
                        dstPtr+= dstStride[0];
                }
        }
-       dst = dstParam[1] + dstStride[1]*srcSliceY;
-       interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );
+       dst = dstParam[1] + dstStride[1]*srcSliceY/2;
+       if (c->dstFormat == IMGFMT_NV12)
+               interleaveBytes( src[1],src[2],dst,c->srcW/2,srcSliceH/2,srcStride[1],srcStride[2],dstStride[0] );
+       else
+               interleaveBytes( src[2],src[1],dst,c->srcW/2,srcSliceH/2,srcStride[2],srcStride[1],dstStride[0] );
 
        return srcSliceH;
 }
@@ -1537,6 +1613,15 @@ static inline void sws_orderYUV(int format, uint8_t * sortedP[], int sortedStrid
                sortedStride[0]= stride[0];
                sortedStride[1]= stride[1];
                sortedStride[2]= stride[2];
+       }
+       else if(format == IMGFMT_NV12 || format == IMGFMT_NV21)
+       {
+               sortedP[0]= p[0];
+               sortedP[1]= p[1];
+               sortedP[2]= NULL;
+               sortedStride[0]= stride[0];
+               sortedStride[1]= stride[1];
+               sortedStride[2]= 0;
        }else{
                MSG_ERR("internal error in orderYUV\n");
        }
@@ -1627,6 +1712,8 @@ static void getSubSampleFactors(int *h, int *v, int format){
                break;
        case IMGFMT_YV12:
        case IMGFMT_Y800: //FIXME remove after different subsamplings are fully implemented
+       case IMGFMT_NV12:
+       case IMGFMT_NV21:
                *h=1;
                *v=1;
                break;
@@ -1708,7 +1795,10 @@ int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange
 
        yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
        //FIXME factorize
-       
+
+#ifdef HAVE_ALTIVEC
+       yuv2rgb_altivec_init_tables (c, inv_table, brightness, contrast, saturation);
+#endif 
        return 0;
 }
 
@@ -1730,7 +1820,7 @@ int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int
 }
 
 SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int dstH, int origDstFormat, int flags,
-                         SwsFilter *srcFilter, SwsFilter *dstFilter){
+                         SwsFilter *srcFilter, SwsFilter *dstFilter, double *param){
 
        SwsContext *c;
        int i;
@@ -1738,7 +1828,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
        int unscaled, needsDither;
        int srcFormat, dstFormat;
        SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
        if(flags & SWS_CPU_CAPS_MMX)
                asm volatile("emms\n\t"::: "memory");
 #endif
@@ -1829,6 +1919,14 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
        if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)) 
                c->chrSrcHSubSample=1;
 
+       if(param){
+               c->param[0] = param[0];
+               c->param[1] = param[1];
+       }else{
+               c->param[0] =
+               c->param[1] = SWS_PARAM_DEFAULT;
+       }
+
        c->chrIntHSubSample= c->chrDstHSubSample;
        c->chrIntVSubSample= c->chrSrcVSubSample;
 
@@ -1844,7 +1942,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
        if(unscaled && !usesHFilter && !usesVFilter)
        {
                /* yv12_to_nv12 */
-               if(srcFormat == IMGFMT_YV12 && dstFormat == IMGFMT_NV12)
+               if(srcFormat == IMGFMT_YV12 && (dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21))
                {
                        c->swScale= PlanarToNV12Wrapper;
                }
@@ -1888,6 +1986,18 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
                        }
                }
 
+#ifdef HAVE_ALTIVEC
+               if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
+                   ((srcFormat == IMGFMT_YV12 && 
+                     (dstFormat == IMGFMT_YUY2 || dstFormat == IMGFMT_UYVY)))) {
+                 // unscaled YV12 -> packed YUV, we want speed
+                 if (dstFormat == IMGFMT_YUY2)
+                   c->swScale= yv12toyuy2_unscaled_altivec;
+                 else
+                   c->swScale= yv12touyvy_unscaled_altivec;
+               }
+#endif
+
                /* simple copy */
                if(   srcFormat == dstFormat
                   || (isPlanarYUV(srcFormat) && isGray(dstFormat))
@@ -1944,21 +2054,33 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
 
        /* precalculate horizontal scaler filter coefficients */
        {
-               const int filterAlign= (flags & SWS_CPU_CAPS_MMX) ? 4 : 1;
+               const int filterAlign=
+                 (flags & SWS_CPU_CAPS_MMX) ? 4 :
+                 (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
+                 1;
 
                initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
                                 srcW      ,       dstW, filterAlign, 1<<14,
                                 (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
-                                srcFilter->lumH, dstFilter->lumH);
+                                srcFilter->lumH, dstFilter->lumH, c->param);
                initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
                                 c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
                                 (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
-                                srcFilter->chrH, dstFilter->chrH);
+                                srcFilter->chrH, dstFilter->chrH, c->param);
 
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
 // can't downscale !!!
                if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
                {
+#define MAX_FUNNY_CODE_SIZE 10000
+#ifdef MAP_ANONYMOUS
+                       c->funnyYCode = (uint8_t*)mmap(NULL, MAX_FUNNY_CODE_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+                       c->funnyUVCode = (uint8_t*)mmap(NULL, MAX_FUNNY_CODE_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+#else
+                       c->funnyYCode = (uint8_t*)memalign(32, MAX_FUNNY_CODE_SIZE);
+                       c->funnyUVCode = (uint8_t*)memalign(32, MAX_FUNNY_CODE_SIZE);
+#endif
+
                        c->lumMmx2Filter   = (int16_t*)memalign(8, (dstW        /8+8)*sizeof(int16_t));
                        c->chrMmx2Filter   = (int16_t*)memalign(8, (c->chrDstW  /4+8)*sizeof(int16_t));
                        c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW      /2/8+8)*sizeof(int32_t));
@@ -1973,14 +2095,20 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
 
 
        /* precalculate vertical scaler filter coefficients */
-       initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
-                       srcH      ,        dstH, 1, (1<<12)-4,
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
-                       srcFilter->lumV, dstFilter->lumV);
-       initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
-                       c->chrSrcH, c->chrDstH, 1, (1<<12)-4,
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
-                       srcFilter->chrV, dstFilter->chrV);
+       {
+               const int filterAlign=
+                 (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
+                 1;
+
+               initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
+                               srcH      ,        dstH, filterAlign, (1<<12)-4,
+                               (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
+                               srcFilter->lumV, dstFilter->lumV, c->param);
+               initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
+                               c->chrSrcH, c->chrDstH, filterAlign, (1<<12)-4,
+                               (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
+                               srcFilter->chrV, dstFilter->chrV, c->param);
+       }
 
        // Calculate Buffer Sizes so that they won't run out while handling these damn slices
        c->vLumBufSize= c->vLumFilterSize;
@@ -2090,7 +2218,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int origSrcFormat, int dstW, int
                }
                else
                {
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
                        MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
 #else
                        if(flags & SWS_FAST_BILINEAR)
@@ -2510,6 +2638,18 @@ void sws_freeContext(SwsContext *c){
        if(c->hChrFilterPos) free(c->hChrFilterPos);
        c->hChrFilterPos = NULL;
 
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#ifdef MAP_ANONYMOUS
+       if(c->funnyYCode) munmap(c->funnyYCode, MAX_FUNNY_CODE_SIZE);
+       if(c->funnyUVCode) munmap(c->funnyUVCode, MAX_FUNNY_CODE_SIZE);
+#else
+       if(c->funnyYCode) free(c->funnyYCode);
+       if(c->funnyUVCode) free(c->funnyUVCode);
+#endif
+       c->funnyYCode=NULL;
+       c->funnyUVCode=NULL;
+#endif
+
        if(c->lumMmx2Filter) free(c->lumMmx2Filter);
        c->lumMmx2Filter=NULL;
        if(c->chrMmx2Filter) free(c->chrMmx2Filter);