git.sesse.net Git - ffmpeg/blobdiff - postproc/swscale_template.c
divx 5.01 support
[ffmpeg] / postproc / swscale_template.c
index ba8a8c1fe127aed492498fbbc2771a2624f355d1..e76020eab7a48e0799e2afae012ac9805232ea98 100644 (file)
@@ -993,6 +993,9 @@ FULL_YSCALEYUV2RGB
                if(dstFormat==IMGFMT_BGR32)
                {
                        int i;
+#ifdef WORDS_BIGENDIAN
+                       dest++;
+#endif
                        for(i=0;i<dstW;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
@@ -1116,6 +1119,9 @@ FULL_YSCALEYUV2RGB
                if(dstFormat==IMGFMT_BGR32)
                {
                        int i;
+#ifdef WORDS_BIGENDIAN
+                       dest++;
+#endif
                        for(i=0; i<dstW-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
@@ -1394,6 +1400,9 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
        if(dstFormat==IMGFMT_BGR32)
        {
                int i;
+#ifdef WORDS_BIGENDIAN
+               dest++;
+#endif
                for(i=0; i<dstW-1; i+=2){
                        // vertical linear interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
@@ -1635,7 +1644,76 @@ static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1
 
 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
 {
-#ifdef HAVE_MMXFIXME
+#ifdef HAVE_MMX
+       asm volatile(
+               "movl %2, %%eax                 \n\t"
+               "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
+               "movq "MANGLE(w1111)", %%mm5            \n\t"
+               "pxor %%mm7, %%mm7              \n\t"
+               "leal (%%eax, %%eax, 2), %%ebx  \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 64(%0, %%ebx)         \n\t"
+               "movd (%0, %%ebx), %%mm0        \n\t"
+               "movd 3(%0, %%ebx), %%mm1       \n\t"
+               "punpcklbw %%mm7, %%mm0         \n\t"
+               "punpcklbw %%mm7, %%mm1         \n\t"
+               "movd 6(%0, %%ebx), %%mm2       \n\t"
+               "movd 9(%0, %%ebx), %%mm3       \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+               "punpcklbw %%mm7, %%mm3         \n\t"
+               "pmaddwd %%mm6, %%mm0           \n\t"
+               "pmaddwd %%mm6, %%mm1           \n\t"
+               "pmaddwd %%mm6, %%mm2           \n\t"
+               "pmaddwd %%mm6, %%mm3           \n\t"
+#ifndef FAST_BGR2YV12
+               "psrad $8, %%mm0                \n\t"
+               "psrad $8, %%mm1                \n\t"
+               "psrad $8, %%mm2                \n\t"
+               "psrad $8, %%mm3                \n\t"
+#endif
+               "packssdw %%mm1, %%mm0          \n\t"
+               "packssdw %%mm3, %%mm2          \n\t"
+               "pmaddwd %%mm5, %%mm0           \n\t"
+               "pmaddwd %%mm5, %%mm2           \n\t"
+               "packssdw %%mm2, %%mm0          \n\t"
+               "psraw $7, %%mm0                \n\t"
+
+               "movd 12(%0, %%ebx), %%mm4      \n\t"
+               "movd 15(%0, %%ebx), %%mm1      \n\t"
+               "punpcklbw %%mm7, %%mm4         \n\t"
+               "punpcklbw %%mm7, %%mm1         \n\t"
+               "movd 18(%0, %%ebx), %%mm2      \n\t"
+               "movd 21(%0, %%ebx), %%mm3      \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+               "punpcklbw %%mm7, %%mm3         \n\t"
+               "pmaddwd %%mm6, %%mm4           \n\t"
+               "pmaddwd %%mm6, %%mm1           \n\t"
+               "pmaddwd %%mm6, %%mm2           \n\t"
+               "pmaddwd %%mm6, %%mm3           \n\t"
+#ifndef FAST_BGR2YV12
+               "psrad $8, %%mm4                \n\t"
+               "psrad $8, %%mm1                \n\t"
+               "psrad $8, %%mm2                \n\t"
+               "psrad $8, %%mm3                \n\t"
+#endif
+               "packssdw %%mm1, %%mm4          \n\t"
+               "packssdw %%mm3, %%mm2          \n\t"
+               "pmaddwd %%mm5, %%mm4           \n\t"
+               "pmaddwd %%mm5, %%mm2           \n\t"
+               "addl $24, %%ebx                \n\t"
+               "packssdw %%mm2, %%mm4          \n\t"
+               "psraw $7, %%mm4                \n\t"
+
+               "packuswb %%mm4, %%mm0          \n\t"
+               "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
+
+               "movq %%mm0, (%1, %%eax)        \n\t"
+               "addl $8, %%eax                 \n\t"
+               " js 1b                         \n\t"
+               : : "r" (src+width*3), "r" (dst+width), "g" (-width)
+               : "%eax", "%ebx"
+       );
 #else
        int i;
        for(i=0; i<width; i++)
@@ -1651,7 +1729,156 @@ static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
 
 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
 {
-#ifdef HAVE_MMXFIXME
+#ifdef HAVE_MMX
+       asm volatile(
+               "movl %4, %%eax                 \n\t"
+               "movq "MANGLE(w1111)", %%mm5            \n\t"
+               "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
+               "pxor %%mm7, %%mm7              \n\t"
+               "leal (%%eax, %%eax, 2), %%ebx  \n\t"
+               "addl %%ebx, %%ebx              \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 64(%0, %%ebx)         \n\t"
+               PREFETCH" 64(%1, %%ebx)         \n\t"
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+               "movq (%0, %%ebx), %%mm0        \n\t"
+               "movq (%1, %%ebx), %%mm1        \n\t"
+               "movq 6(%0, %%ebx), %%mm2       \n\t"
+               "movq 6(%1, %%ebx), %%mm3       \n\t"
+               PAVGB(%%mm1, %%mm0)
+               PAVGB(%%mm3, %%mm2)
+               "movq %%mm0, %%mm1              \n\t"
+               "movq %%mm2, %%mm3              \n\t"
+               "psrlq $24, %%mm0               \n\t"
+               "psrlq $24, %%mm2               \n\t"
+               PAVGB(%%mm1, %%mm0)
+               PAVGB(%%mm3, %%mm2)
+               "punpcklbw %%mm7, %%mm0         \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+#else
+               "movd (%0, %%ebx), %%mm0        \n\t"
+               "movd (%1, %%ebx), %%mm1        \n\t"
+               "movd 3(%0, %%ebx), %%mm2       \n\t"
+               "movd 3(%1, %%ebx), %%mm3       \n\t"
+               "punpcklbw %%mm7, %%mm0         \n\t"
+               "punpcklbw %%mm7, %%mm1         \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+               "punpcklbw %%mm7, %%mm3         \n\t"
+               "paddw %%mm1, %%mm0             \n\t"
+               "paddw %%mm3, %%mm2             \n\t"
+               "paddw %%mm2, %%mm0             \n\t"
+               "movd 6(%0, %%ebx), %%mm4       \n\t"
+               "movd 6(%1, %%ebx), %%mm1       \n\t"
+               "movd 9(%0, %%ebx), %%mm2       \n\t"
+               "movd 9(%1, %%ebx), %%mm3       \n\t"
+               "punpcklbw %%mm7, %%mm4         \n\t"
+               "punpcklbw %%mm7, %%mm1         \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+               "punpcklbw %%mm7, %%mm3         \n\t"
+               "paddw %%mm1, %%mm4             \n\t"
+               "paddw %%mm3, %%mm2             \n\t"
+               "paddw %%mm4, %%mm2             \n\t"
+               "psrlw $2, %%mm0                \n\t"
+               "psrlw $2, %%mm2                \n\t"
+#endif
+               "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
+               "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
+               
+               "pmaddwd %%mm0, %%mm1           \n\t"
+               "pmaddwd %%mm2, %%mm3           \n\t"
+               "pmaddwd %%mm6, %%mm0           \n\t"
+               "pmaddwd %%mm6, %%mm2           \n\t"
+#ifndef FAST_BGR2YV12
+               "psrad $8, %%mm0                \n\t"
+               "psrad $8, %%mm1                \n\t"
+               "psrad $8, %%mm2                \n\t"
+               "psrad $8, %%mm3                \n\t"
+#endif
+               "packssdw %%mm2, %%mm0          \n\t"
+               "packssdw %%mm3, %%mm1          \n\t"
+               "pmaddwd %%mm5, %%mm0           \n\t"
+               "pmaddwd %%mm5, %%mm1           \n\t"
+               "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
+               "psraw $7, %%mm0                \n\t"
+
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+               "movq 12(%0, %%ebx), %%mm4      \n\t"
+               "movq 12(%1, %%ebx), %%mm1      \n\t"
+               "movq 18(%0, %%ebx), %%mm2      \n\t"
+               "movq 18(%1, %%ebx), %%mm3      \n\t"
+               PAVGB(%%mm1, %%mm4)
+               PAVGB(%%mm3, %%mm2)
+               "movq %%mm4, %%mm1              \n\t"
+               "movq %%mm2, %%mm3              \n\t"
+               "psrlq $24, %%mm4               \n\t"
+               "psrlq $24, %%mm2               \n\t"
+               PAVGB(%%mm1, %%mm4)
+               PAVGB(%%mm3, %%mm2)
+               "punpcklbw %%mm7, %%mm4         \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+#else
+               "movd 12(%0, %%ebx), %%mm4      \n\t"
+               "movd 12(%1, %%ebx), %%mm1      \n\t"
+               "movd 15(%0, %%ebx), %%mm2      \n\t"
+               "movd 15(%1, %%ebx), %%mm3      \n\t"
+               "punpcklbw %%mm7, %%mm4         \n\t"
+               "punpcklbw %%mm7, %%mm1         \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+               "punpcklbw %%mm7, %%mm3         \n\t"
+               "paddw %%mm1, %%mm4             \n\t"
+               "paddw %%mm3, %%mm2             \n\t"
+               "paddw %%mm2, %%mm4             \n\t"
+               "movd 18(%0, %%ebx), %%mm5      \n\t"
+               "movd 18(%1, %%ebx), %%mm1      \n\t"
+               "movd 21(%0, %%ebx), %%mm2      \n\t"
+               "movd 21(%1, %%ebx), %%mm3      \n\t"
+               "punpcklbw %%mm7, %%mm5         \n\t"
+               "punpcklbw %%mm7, %%mm1         \n\t"
+               "punpcklbw %%mm7, %%mm2         \n\t"
+               "punpcklbw %%mm7, %%mm3         \n\t"
+               "paddw %%mm1, %%mm5             \n\t"
+               "paddw %%mm3, %%mm2             \n\t"
+               "paddw %%mm5, %%mm2             \n\t"
+               "movq "MANGLE(w1111)", %%mm5            \n\t"
+               "psrlw $2, %%mm4                \n\t"
+               "psrlw $2, %%mm2                \n\t"
+#endif
+               "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
+               "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
+               
+               "pmaddwd %%mm4, %%mm1           \n\t"
+               "pmaddwd %%mm2, %%mm3           \n\t"
+               "pmaddwd %%mm6, %%mm4           \n\t"
+               "pmaddwd %%mm6, %%mm2           \n\t"
+#ifndef FAST_BGR2YV12
+               "psrad $8, %%mm4                \n\t"
+               "psrad $8, %%mm1                \n\t"
+               "psrad $8, %%mm2                \n\t"
+               "psrad $8, %%mm3                \n\t"
+#endif
+               "packssdw %%mm2, %%mm4          \n\t"
+               "packssdw %%mm3, %%mm1          \n\t"
+               "pmaddwd %%mm5, %%mm4           \n\t"
+               "pmaddwd %%mm5, %%mm1           \n\t"
+               "addl $24, %%ebx                \n\t"
+               "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
+               "psraw $7, %%mm4                \n\t"
+               
+               "movq %%mm0, %%mm1              \n\t"
+               "punpckldq %%mm4, %%mm0         \n\t"
+               "punpckhdq %%mm4, %%mm1         \n\t"
+               "packsswb %%mm1, %%mm0          \n\t"
+               "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
+
+               "movd %%mm0, (%2, %%eax)        \n\t"
+               "punpckhdq %%mm0, %%mm0         \n\t"
+               "movd %%mm0, (%3, %%eax)        \n\t"
+               "addl $4, %%eax                 \n\t"
+               " js 1b                         \n\t"
+               : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
+               : "%eax", "%ebx"
+       );
 #else
        int i;
        for(i=0; i<width; i++)
@@ -1666,6 +1893,185 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1
 #endif
 }
 
+/**
+ * Converts one row of 16 bit pixels into 8 bit luma samples.
+ * Each pixel is read low byte first; blue sits in bits 0-4, green in
+ * bits 5-10 and red in bits 11-15 of the assembled value.
+ * @param dst   receives width luma bytes
+ * @param src   packed input row, 2 bytes per pixel
+ * @param width number of pixels to convert
+ */
+static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+       int x;
+       for(x=0; x<width; x++, src+=2)
+       {
+               const int pixel= src[0] | (src[1]<<8);
+               const int blue = pixel&0x1F;
+               const int green= (pixel>>5)&0x3F;
+               const int red  = (pixel>>11)&0x1F;
+               // red and blue have 5 bits, green has 6: doubling red and blue puts all
+               // three on the 6 bit scale, the shift reduced by 2 expands that to 8 bit
+               const int luma = 2*RY*red + GY*green + 2*BY*blue;
+
+               dst[x]= (luma>>(RGB2YUV_SHIFT-2)) + 16;
+       }
+}
+
+/**
+ * Converts two rows of 16 bit pixels (low byte first; blue in bits 0-4,
+ * green in bits 5-10, red in bits 11-15) into one row of U and V samples.
+ * Every output sample is computed from a 2x2 block: two horizontally
+ * adjacent pixels of src1 and the two pixels at the same position in src2.
+ * @param dstU  receives width U bytes
+ * @param dstV  receives width V bytes
+ * @param src1  first input row, 4 bytes are consumed per output sample
+ * @param src2  second input row, 4 bytes are consumed per output sample
+ * @param width number of output samples
+ */
+static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+       int i;
+       for(i=0; i<width; i++)
+       {
+               // 2 adjacent pixels of each row as one little endian 32 bit word, assembled
+               // from bytes so that src1/src2 need no particular alignment
+               const uint32_t d0= src1[i*4] | (src1[i*4+1]<<8) | (src1[i*4+2]<<16) | ((uint32_t)src1[i*4+3]<<24);
+               const uint32_t d1= src2[i*4] | (src2[i*4+1]<<8) | (src2[i*4+2]<<16) | ((uint32_t)src2[i*4+3]<<24);
+
+               // dl: blue (bits 0-4) and red (bits 11-15) of the 1st pixel, green (bits 21-26)
+               // of the 2nd pixel, added up over both rows
+               const uint32_t dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
+               // dh: green (bits 0-5) of the 1st pixel, blue (bits 11-15) and red (bits 22-26)
+               // of the 2nd pixel, added up over both rows
+               const uint32_t dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
+
+               // move the fields of dh onto the positions used in dl; the arithmetic is
+               // unsigned so the bits shifted out at the top are simply discarded
+               const uint32_t dh2= (dh>>11) + (dh<<21);
+               const uint32_t d= dh2 + dl;
+
+               // each field now holds the sum over the 4 pixels of the 2x2 block
+               const int b= d&0x7F;
+               const int r= (d>>11)&0x7F;
+               const int g= d>>21;
+
+               // red and blue are doubled to match the 6 bit green scale; in the shift
+               // +2 divides by the 4 pixels and -2 presumably expands 6 bit to 8 bit
+               dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
+               dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
+       }
+}
+
+/**
+ * Converts one row of 15 bit pixels (2 bytes each) into 8 bit luma samples.
+ * Each pixel is read low byte first; blue sits in bits 0-4, green in
+ * bits 5-9 and red in bits 10-14, bit 15 is ignored.
+ * @param dst   receives width luma bytes
+ * @param src   packed input row, 2 bytes per pixel
+ * @param width number of pixels to convert
+ */
+static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+       int x;
+       for(x=0; x<width; x++, src+=2)
+       {
+               const int pixel= src[0] | (src[1]<<8);
+               const int blue = pixel&0x1F;
+               const int green= (pixel>>5)&0x1F;
+               const int red  = (pixel>>10)&0x1F;
+               // all components have 5 bits, the shift reduced by 3 expands them to 8 bit
+               const int luma = RY*red + GY*green + BY*blue;
+
+               dst[x]= (luma>>(RGB2YUV_SHIFT-3)) + 16;
+       }
+}
+
+/**
+ * Converts two rows of 15 bit pixels (2 bytes each, low byte first; blue in
+ * bits 0-4, green in bits 5-9, red in bits 10-14, bit 15 ignored) into one
+ * row of U and V samples.
+ * Every output sample is computed from a 2x2 block: two horizontally
+ * adjacent pixels of src1 and the two pixels at the same position in src2.
+ * @param dstU  receives width U bytes
+ * @param dstV  receives width V bytes
+ * @param src1  first input row, 4 bytes are consumed per output sample
+ * @param src2  second input row, 4 bytes are consumed per output sample
+ * @param width number of output samples
+ */
+static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+       int i;
+       for(i=0; i<width; i++)
+       {
+               // 2 adjacent pixels of each row as one little endian 32 bit word, assembled
+               // from bytes so that src1/src2 need no particular alignment
+               const uint32_t d0= src1[i*4] | (src1[i*4+1]<<8) | (src1[i*4+2]<<16) | ((uint32_t)src1[i*4+3]<<24);
+               const uint32_t d1= src2[i*4] | (src2[i*4+1]<<8) | (src2[i*4+2]<<16) | ((uint32_t)src2[i*4+3]<<24);
+
+               // dl: blue (bits 0-4) and red (bits 10-14) of the 1st pixel, green (bits 21-25)
+               // of the 2nd pixel, added up over both rows
+               const uint32_t dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
+               // dh: green (bits 0-4) of the 1st pixel, blue (bits 11-15) and red (bits 21-25)
+               // of the 2nd pixel, added up over both rows
+               const uint32_t dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
+
+               // move the fields of dh onto the positions used in dl; the arithmetic is
+               // unsigned so the bits shifted out at the top are simply discarded
+               const uint32_t dh2= (dh>>11) + (dh<<21);
+               const uint32_t d= dh2 + dl;
+
+               // each field now holds the sum over the 4 pixels of the 2x2 block
+               const int b= d&0x7F;
+               const int r= (d>>10)&0x7F;
+               const int g= d>>21;
+
+               // in the shift +2 divides by the 4 pixels and -3 presumably expands
+               // the 5 bit components to 8 bit
+               dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
+               dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
+       }
+}
+
+
+/**
+ * Converts one row of 4 byte pixels into 8 bit luma samples.
+ * Bytes 0, 1 and 2 of every pixel are used as red, green and blue,
+ * byte 3 is not read.
+ * @param dst   receives width luma bytes
+ * @param src   input row, 4 bytes per pixel
+ * @param width number of pixels to convert
+ */
+static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+       int x;
+       for(x=0; x<width; x++)
+       {
+               const uint8_t *px= src + 4*x;
+               const int luma= RY*px[0] + GY*px[1] + BY*px[2];
+
+               dst[x]= (luma>>RGB2YUV_SHIFT) + 16;
+       }
+}
+
+/**
+ * Converts two rows of 4 byte pixels (bytes 0, 1, 2 used as red, green, blue)
+ * into one row of U and V samples; every output sample is computed from the
+ * sum over two adjacent pixels of src1 and the two pixels at the same
+ * position in src2.
+ * @param dstU  receives width U bytes
+ * @param dstV  receives width V bytes
+ * @param src1  first input row, 8 bytes are consumed per output sample
+ * @param src2  second input row, 8 bytes are consumed per output sample
+ * @param width number of output samples
+ */
+static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+       int x;
+       for(x=0; x<width; x++)
+       {
+               const uint8_t *row1= src1 + 8*x;
+               const uint8_t *row2= src2 + 8*x;
+               // component sums over the 2x2 pixel block
+               const int red  = row1[0] + row1[4] + row2[0] + row2[4];
+               const int green= row1[1] + row1[5] + row2[1] + row2[5];
+               const int blue = row1[2] + row1[6] + row2[2] + row2[6];
+
+               // the 2 extra bits of shift divide the sums by the 4 pixels
+               dstU[x]= ((RU*red + GU*green + BU*blue)>>(RGB2YUV_SHIFT+2)) + 128;
+               dstV[x]= ((RV*red + GV*green + BV*blue)>>(RGB2YUV_SHIFT+2)) + 128;
+       }
+}
+
+/**
+ * Converts one row of 3 byte pixels into 8 bit luma samples.
+ * Bytes 0, 1 and 2 of every pixel are used as red, green and blue.
+ * @param dst   receives width luma bytes
+ * @param src   input row, 3 bytes per pixel
+ * @param width number of pixels to convert
+ */
+static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+       int x;
+       for(x=0; x<width; x++)
+       {
+               const uint8_t *px= src + 3*x;
+               const int luma= RY*px[0] + GY*px[1] + BY*px[2];
+
+               dst[x]= (luma>>RGB2YUV_SHIFT) + 16;
+       }
+}
+
+/**
+ * Converts two rows of 3 byte pixels (bytes 0, 1, 2 used as red, green, blue)
+ * into one row of U and V samples; every output sample is computed from the
+ * sum over two adjacent pixels of src1 and the two pixels at the same
+ * position in src2.
+ * @param dstU  receives width U bytes
+ * @param dstV  receives width V bytes
+ * @param src1  first input row, 6 bytes are consumed per output sample
+ * @param src2  second input row, 6 bytes are consumed per output sample
+ * @param width number of output samples
+ */
+static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+       int x;
+       for(x=0; x<width; x++)
+       {
+               const uint8_t *row1= src1 + 6*x;
+               const uint8_t *row2= src2 + 6*x;
+               // component sums over the 2x2 pixel block
+               const int red  = row1[0] + row1[3] + row2[0] + row2[3];
+               const int green= row1[1] + row1[4] + row2[1] + row2[4];
+               const int blue = row1[2] + row1[5] + row2[2] + row2[5];
+
+               // the 2 extra bits of shift divide the sums by the 4 pixels
+               dstU[x]= ((RU*red + GU*green + BU*blue)>>(RGB2YUV_SHIFT+2)) + 128;
+               dstV[x]= ((RV*red + GV*green + BV*blue)>>(RGB2YUV_SHIFT+2)) + 128;
+       }
+}
+
 
 // Bilinear / Bicubic scaling
 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
@@ -1832,7 +2238,8 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
-                                  int srcFormat, uint8_t *formatConvBuffer)
+                                  int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+                                  int32_t *mmx2FilterPos)
 {
     if(srcFormat==IMGFMT_YUY2)
     {
@@ -1849,6 +2256,26 @@ static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, in
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
     }
+    else if(srcFormat==IMGFMT_BGR16)
+    {
+       RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
+       src= formatConvBuffer;
+    }
+    else if(srcFormat==IMGFMT_BGR15)
+    {
+       RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
+       src= formatConvBuffer;
+    }
+    else if(srcFormat==IMGFMT_RGB32)
+    {
+       RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
+       src= formatConvBuffer;
+    }
+    else if(srcFormat==IMGFMT_RGB24)
+    {
+       RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
+       src= formatConvBuffer;
+    }
 
 #ifdef HAVE_MMX
        // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
@@ -1868,35 +2295,21 @@ static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, in
        {
                asm volatile(
                        "pxor %%mm7, %%mm7              \n\t"
-                       "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
-                       "movd %5, %%mm6                 \n\t" // xInc&0xFFFF
-                       "punpcklwd %%mm6, %%mm6         \n\t"
-                       "punpcklwd %%mm6, %%mm6         \n\t"
-                       "movq %%mm6, %%mm2              \n\t"
-                       "psllq $16, %%mm2               \n\t"
-                       "paddw %%mm6, %%mm2             \n\t"
-                       "psllq $16, %%mm2               \n\t"
-                       "paddw %%mm6, %%mm2             \n\t"
-                       "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=xInc&0xFF
-                       "movq %%mm2, %%mm4              \n\t"
-                       "movd %4, %%mm6                 \n\t" //(xInc*4)&0xFFFF
-                       "punpcklwd %%mm6, %%mm6         \n\t"
-                       "punpcklwd %%mm6, %%mm6         \n\t"
+                       "movl %0, %%ecx                 \n\t"
+                       "movl %1, %%edi                 \n\t"
+                       "movl %2, %%edx                 \n\t"
+                       "movl %3, %%ebx                 \n\t"
                        "xorl %%eax, %%eax              \n\t" // i
-                       "movl %0, %%esi                 \n\t" // src
-                       "movl %1, %%edi                 \n\t" // buf1
-                       "movl %3, %%edx                 \n\t" // (xInc*4)>>16
-                       "xorl %%ecx, %%ecx              \n\t"
-                       "xorl %%ebx, %%ebx              \n\t"
-                       "movw %4, %%bx                  \n\t" // (xInc*4)&0xFFFF
+                       PREFETCH" (%%ecx)               \n\t"
+                       PREFETCH" 32(%%ecx)             \n\t"
+                       PREFETCH" 64(%%ecx)             \n\t"
 
 #define FUNNY_Y_CODE \
-                       PREFETCH" 1024(%%esi)           \n\t"\
-                       PREFETCH" 1056(%%esi)           \n\t"\
-                       PREFETCH" 1088(%%esi)           \n\t"\
-                       "call *%6                       \n\t"\
-                       "movq %%mm4, %%mm2              \n\t"\
-                       "xorl %%ecx, %%ecx              \n\t"
+                       "movl (%%ebx), %%esi            \n\t"\
+                       "call *%4                       \n\t"\
+                       "addl (%%ebx, %%eax), %%ecx     \n\t"\
+                       "addl %%eax, %%edi              \n\t"\
+                       "xorl %%eax, %%eax              \n\t"\
 
 FUNNY_Y_CODE
 FUNNY_Y_CODE
@@ -1907,8 +2320,8 @@ FUNNY_Y_CODE
 FUNNY_Y_CODE
 FUNNY_Y_CODE
 
-                       :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
-                       "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
+                       :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+                       "m" (funnyYCode)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
@@ -1976,7 +2389,8 @@ FUNNY_Y_CODE
 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
-                                  int srcFormat, uint8_t *formatConvBuffer)
+                                  int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+                                  int32_t *mmx2FilterPos)
 {
     if(srcFormat==IMGFMT_YUY2)
     {
@@ -1996,6 +2410,30 @@ inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, u
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
     }
+    else if(srcFormat==IMGFMT_BGR16)
+    {
+       RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+       src1= formatConvBuffer;
+       src2= formatConvBuffer+2048;
+    }
+    else if(srcFormat==IMGFMT_BGR15)
+    {
+       RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+       src1= formatConvBuffer;
+       src2= formatConvBuffer+2048;
+    }
+    else if(srcFormat==IMGFMT_RGB32)
+    {
+       RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+       src1= formatConvBuffer;
+       src2= formatConvBuffer+2048;
+    }
+    else if(srcFormat==IMGFMT_RGB24)
+    {
+       RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+       src1= formatConvBuffer;
+       src2= formatConvBuffer+2048;
+    }
     else if(isGray(srcFormat))
     {
        return;
@@ -2019,65 +2457,44 @@ inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, u
        if(canMMX2BeUsed)
        {
                asm volatile(
-               "pxor %%mm7, %%mm7              \n\t"
-               "pxor %%mm2, %%mm2              \n\t" // 2*xalpha
-               "movd %5, %%mm6                 \n\t" // xInc&0xFFFF
-               "punpcklwd %%mm6, %%mm6         \n\t"
-               "punpcklwd %%mm6, %%mm6         \n\t"
-               "movq %%mm6, %%mm2              \n\t"
-               "psllq $16, %%mm2               \n\t"
-               "paddw %%mm6, %%mm2             \n\t"
-               "psllq $16, %%mm2               \n\t"
-               "paddw %%mm6, %%mm2             \n\t"
-               "psllq $16, %%mm2               \n\t" //0,t,2t,3t               t=xInc&0xFFFF
-               "movq %%mm2, %%mm4              \n\t"
-               "movd %4, %%mm6                 \n\t" //(xInc*4)&0xFFFF
-               "punpcklwd %%mm6, %%mm6         \n\t"
-               "punpcklwd %%mm6, %%mm6         \n\t"
-               "xorl %%eax, %%eax              \n\t" // i
-               "movl %0, %%esi                 \n\t" // src
-               "movl %1, %%edi                 \n\t" // buf1
-               "movl %3, %%edx                 \n\t" // (xInc*4)>>16
-               "xorl %%ecx, %%ecx              \n\t"
-               "xorl %%ebx, %%ebx              \n\t"
-               "movw %4, %%bx                  \n\t" // (xInc*4)&0xFFFF
-
-#define FUNNYUVCODE \
-                       PREFETCH" 1024(%%esi)           \n\t"\
-                       PREFETCH" 1056(%%esi)           \n\t"\
-                       PREFETCH" 1088(%%esi)           \n\t"\
-                       "call *%7                       \n\t"\
-                       "movq %%mm4, %%mm2      \n\t"\
-                       "xorl %%ecx, %%ecx              \n\t"
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-               "xorl %%eax, %%eax              \n\t" // i
-               "movl %6, %%esi                 \n\t" // src
-               "movl %1, %%edi                 \n\t" // buf1
-               "addl $4096, %%edi              \n\t"
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-FUNNYUVCODE
-
-               :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
-                 "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
-               : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
-       );
+                       "pxor %%mm7, %%mm7              \n\t"
+                       "movl %0, %%ecx                 \n\t"
+                       "movl %1, %%edi                 \n\t"
+                       "movl %2, %%edx                 \n\t"
+                       "movl %3, %%ebx                 \n\t"
+                       "xorl %%eax, %%eax              \n\t" // i
+                       PREFETCH" (%%ecx)               \n\t"
+                       PREFETCH" 32(%%ecx)             \n\t"
+                       PREFETCH" 64(%%ecx)             \n\t"
+
+#define FUNNY_UV_CODE \
+                       "movl (%%ebx), %%esi            \n\t"\
+                       "call *%4                       \n\t"\
+                       "addl (%%ebx, %%eax), %%ecx     \n\t"\
+                       "addl %%eax, %%edi              \n\t"\
+                       "xorl %%eax, %%eax              \n\t"\
+
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+                       "xorl %%eax, %%eax              \n\t" // i
+                       "movl %5, %%ecx                 \n\t" // src
+                       "movl %1, %%edi                 \n\t" // buf1
+                       "addl $4096, %%edi              \n\t"
+                       PREFETCH" (%%ecx)               \n\t"
+                       PREFETCH" 32(%%ecx)             \n\t"
+                       PREFETCH" 64(%%ecx)             \n\t"
+
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+
+                       :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+                       "m" (funnyUVCode), "m" (src2)
+                       : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
+               );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
                {
 //                     printf("%d %d %d\n", dstWidth, i, srcW);
@@ -2148,7 +2565,7 @@ FUNNYUVCODE
 }
 
 static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
-             int srcSliceH, uint8_t* dstParam[], int dstStride[]){
+             int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
 
        /* load a few things into local vars to make the code more readable? and faster */
        const int srcW= c->srcW;
@@ -2189,10 +2606,11 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;
        int srcStride[3];
+       int dstStride[3];
        uint8_t *src[3];
        uint8_t *dst[3];
        
-       if((c->srcFormat == IMGFMT_IYUV) || (c->srcFormat == IMGFMT_I420)){
+       if(c->srcFormat == IMGFMT_I420){
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
@@ -2225,17 +2643,24 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
                srcStride[2]= 0;
        }
 
-       if((c->dstFormat == IMGFMT_IYUV) || (c->dstFormat == IMGFMT_I420)){
+       if(dstFormat == IMGFMT_I420){
                dst[0]= dstParam[0];
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];
-               
+               dstStride[0]= dstStrideParam[0];
+               dstStride[1]= dstStrideParam[2];
+               dstStride[2]= dstStrideParam[1];
        }else{
                dst[0]= dstParam[0];
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
+               dstStride[0]= dstStrideParam[0];
+               dstStride[1]= dstStrideParam[1];
+               dstStride[2]= dstStrideParam[2];
        }
-       
+
+//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
+//dstStride[0],dstStride[1],dstStride[2]);
 
        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
@@ -2277,7 +2702,7 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
 
                // Do we have enough lines in this slice to output the dstY line
-               if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
+               if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH + 1)>>1))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
@@ -2291,7 +2716,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
 //                             printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
-                                               funnyYCode, c->srcFormat, formatConvBuffer);
+                                               funnyYCode, c->srcFormat, formatConvBuffer, 
+                                               c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
@@ -2300,12 +2726,13 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
-                               ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
+                               ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                //FIXME replace parameters through context struct (some at least)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
-                                               funnyUVCode, c->srcFormat, formatConvBuffer);
+                                               funnyUVCode, c->srcFormat, formatConvBuffer, 
+                                               c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
@@ -2329,7 +2756,8 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
-                                               funnyYCode, c->srcFormat, formatConvBuffer);
+                                               funnyYCode, c->srcFormat, formatConvBuffer, 
+                                               c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
@@ -2338,11 +2766,12 @@ static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStridePar
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
-                               ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
+                               ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
-                                               funnyUVCode, c->srcFormat, formatConvBuffer);
+                                               funnyUVCode, c->srcFormat, formatConvBuffer, 
+                                               c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer