git.sesse.net Git - ffmpeg/commitdiff
print more info if -v
author Michael Niedermayer <michaelni@gmx.at>
Thu, 6 Dec 2001 19:07:25 +0000 (19:07 +0000)
committer Michael Niedermayer <michaelni@gmx.at>
Thu, 6 Dec 2001 19:07:25 +0000 (19:07 +0000)
use new horizontal mmx scaler instead of old x86asm if mmx2 can't be used (FAST_BILINEAR only)
fixed overflow in init function ... using double precision fp now :)
using C scaler for the last 1-2 lines if there is a chance to write over the end of the dst array

Originally committed as revision 3353 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/swscale.c
postproc/swscale_template.c

index 138f35891a83216bc68b60c32e95b33c7af611ff..d81fa6b37f321775a785021c817e1bddac61088d 100644 (file)
@@ -31,14 +31,14 @@ int allwaysIpol=0;
 //#define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
 #define ASSERT(x) ;
 
-
+extern int verbose; // defined in mplayer.c
 /*
 NOTES
 
 known BUGS with known cause (no bugreports please!, but patches are welcome :) )
-horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
+horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
 
-Supported output formats BGR15 BGR16 BGR24 BGR32, YV12
+Supported output formats BGR15 BGR16 BGR24 BGR32 YV12
 BGR15 & BGR16 MMX verions support dithering
 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 
@@ -49,6 +49,7 @@ change the distance of the u & v buffer
 Move static / global vars into a struct so multiple scalers can be used
 write special vertical cubic upscale version
 Optimize C code (yv12 / minmax)
+dstStride[3]
 */
 
 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
@@ -183,6 +184,203 @@ void in_asm_used_var_warning_killer()
 }
 #endif
 
+static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+                                   int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+                                   uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
+{
+       //FIXME Optimize (just quickly writen not opti..)
+       int i;
+       for(i=0; i<dstW; i++)
+       {
+               int val=0;
+               int j;
+               for(j=0; j<lumFilterSize; j++)
+                       val += lumSrc[j][i] * lumFilter[j];
+
+               dest[i]= MIN(MAX(val>>19, 0), 255);
+       }
+
+       if(uDest != NULL)
+               for(i=0; i<(dstW>>1); i++)
+               {
+                       int u=0;
+                       int v=0;
+                       int j;
+                       for(j=0; j<lumFilterSize; j++)
+                       {
+                               u += chrSrc[j][i] * chrFilter[j];
+                               v += chrSrc[j][i + 2048] * chrFilter[j];
+                       }
+
+                       uDest[i]= MIN(MAX(u>>19, 0), 255);
+                       vDest[i]= MIN(MAX(v>>19, 0), 255);
+               }
+}
+
+static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+                                   int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+                                   uint8_t *dest, int dstW, int dstbpp)
+{
+       if(dstbpp==32)
+       {
+               int i;
+               for(i=0; i<(dstW>>1); i++){
+                       int j;
+                       int Y1=0;
+                       int Y2=0;
+                       int U=0;
+                       int V=0;
+                       int Cb, Cr, Cg;
+                       for(j=0; j<lumFilterSize; j++)
+                       {
+                               Y1 += lumSrc[j][2*i] * lumFilter[j];
+                               Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+                       }
+                       for(j=0; j<chrFilterSize; j++)
+                       {
+                               U += chrSrc[j][i] * chrFilter[j];
+                               V += chrSrc[j][i+2048] * chrFilter[j];
+                       }
+                       Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
+                       Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+                       U >>= 19;
+                       V >>= 19;
+
+                       Cb= clip_yuvtab_40cf[U+ 256];
+                       Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+                       Cr= clip_yuvtab_3343[V+ 256];
+
+                       dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
+                       dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
+                       dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
+
+                       dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
+                       dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
+                       dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
+               }
+       }
+       else if(dstbpp==24)
+       {
+               int i;
+               for(i=0; i<(dstW>>1); i++){
+                       int j;
+                       int Y1=0;
+                       int Y2=0;
+                       int U=0;
+                       int V=0;
+                       int Cb, Cr, Cg;
+                       for(j=0; j<lumFilterSize; j++)
+                       {
+                               Y1 += lumSrc[j][2*i] * lumFilter[j];
+                               Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+                       }
+                       for(j=0; j<chrFilterSize; j++)
+                       {
+                               U += chrSrc[j][i] * chrFilter[j];
+                               V += chrSrc[j][i+2048] * chrFilter[j];
+                       }
+                       Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
+                       Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+                       U >>= 19;
+                       V >>= 19;
+
+                       Cb= clip_yuvtab_40cf[U+ 256];
+                       Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+                       Cr= clip_yuvtab_3343[V+ 256];
+
+                       dest[0]=clip_table[((Y1 + Cb) >>13)];
+                       dest[1]=clip_table[((Y1 + Cg) >>13)];
+                       dest[2]=clip_table[((Y1 + Cr) >>13)];
+
+                       dest[3]=clip_table[((Y2 + Cb) >>13)];
+                       dest[4]=clip_table[((Y2 + Cg) >>13)];
+                       dest[5]=clip_table[((Y2 + Cr) >>13)];
+                       dest+=6;
+               }
+       }
+       else if(dstbpp==16)
+       {
+               int i;
+               for(i=0; i<(dstW>>1); i++){
+                       int j;
+                       int Y1=0;
+                       int Y2=0;
+                       int U=0;
+                       int V=0;
+                       int Cb, Cr, Cg;
+                       for(j=0; j<lumFilterSize; j++)
+                       {
+                               Y1 += lumSrc[j][2*i] * lumFilter[j];
+                               Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+                       }
+                       for(j=0; j<chrFilterSize; j++)
+                       {
+                               U += chrSrc[j][i] * chrFilter[j];
+                               V += chrSrc[j][i+2048] * chrFilter[j];
+                       }
+                       Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
+                       Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+                       U >>= 19;
+                       V >>= 19;
+
+                       Cb= clip_yuvtab_40cf[U+ 256];
+                       Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+                       Cr= clip_yuvtab_3343[V+ 256];
+
+                       ((uint16_t*)dest)[2*i] =
+                               clip_table16b[(Y1 + Cb) >>13] |
+                               clip_table16g[(Y1 + Cg) >>13] |
+                               clip_table16r[(Y1 + Cr) >>13];
+
+                       ((uint16_t*)dest)[2*i+1] =
+                               clip_table16b[(Y2 + Cb) >>13] |
+                               clip_table16g[(Y2 + Cg) >>13] |
+                               clip_table16r[(Y2 + Cr) >>13];
+               }
+       }
+       else if(dstbpp==15)
+       {
+               int i;
+               for(i=0; i<(dstW>>1); i++){
+                       int j;
+                       int Y1=0;
+                       int Y2=0;
+                       int U=0;
+                       int V=0;
+                       int Cb, Cr, Cg;
+                       for(j=0; j<lumFilterSize; j++)
+                       {
+                               Y1 += lumSrc[j][2*i] * lumFilter[j];
+                               Y2 += lumSrc[j][2*i+1] * lumFilter[j];
+                       }
+                       for(j=0; j<chrFilterSize; j++)
+                       {
+                               U += chrSrc[j][i] * chrFilter[j];
+                               V += chrSrc[j][i+2048] * chrFilter[j];
+                       }
+                       Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
+                       Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
+                       U >>= 19;
+                       V >>= 19;
+
+                       Cb= clip_yuvtab_40cf[U+ 256];
+                       Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
+                       Cr= clip_yuvtab_3343[V+ 256];
+
+                       ((uint16_t*)dest)[2*i] =
+                               clip_table15b[(Y1 + Cb) >>13] |
+                               clip_table15g[(Y1 + Cg) >>13] |
+                               clip_table15r[(Y1 + Cr) >>13];
+
+                       ((uint16_t*)dest)[2*i+1] =
+                               clip_table15b[(Y2 + Cb) >>13] |
+                               clip_table15g[(Y2 + Cg) >>13] |
+                               clip_table15r[(Y2 + Cr) >>13];
+               }
+       }
+}
+
+
 //Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
 //Plain C versions
 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
@@ -270,7 +468,6 @@ void in_asm_used_var_warning_killer()
 
 // *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
 // *** Note: it's called multiple times while decoding a frame, first time y==0
-// *** Designed to upscale, but may work for downscale too.
 // switching the cpu type during a sliced drawing can have bad effects, like sig11
 void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int srcSliceY ,
                             int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
index 6191d672f0fbf26596097f46303ae3784b9354ed..17102370c90cd355a3a06a8d8ee99dd1cd76f35a 100644 (file)
@@ -672,33 +672,9 @@ static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lu
                        : "%eax", "%edx", "%esi"
                );
 #else
-       //FIXME Optimize (just quickly writen not opti..)
-       int i;
-       for(i=0; i<dstW; i++)
-       {
-               int val=0;
-               int j;
-               for(j=0; j<lumFilterSize; j++)
-                       val += lumSrc[j][i] * lumFilter[j];
-
-               dest[i]= MIN(MAX(val>>19, 0), 255);
-       }
-
-       if(uDest != NULL)
-               for(i=0; i<(dstW>>1); i++)
-               {
-                       int u=0;
-                       int v=0;
-                       int j;
-                       for(j=0; j<lumFilterSize; j++)
-                       {
-                               u += chrSrc[j][i] * chrFilter[j];
-                               v += chrSrc[j][i + 2048] * chrFilter[j];
-                       }
-
-                       uDest[i]= MIN(MAX(u>>19, 0), 255);
-                       vDest[i]= MIN(MAX(v>>19, 0), 255);
-               }
+yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
+           chrFilter, chrSrc, chrFilterSize,
+           dest, uDest, vDest, dstW);
 #endif
 }
 
@@ -836,163 +812,10 @@ static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lu
                        );
                }
 #else
-               if(dstbpp==32)
-               {
-                       int i;
-                       for(i=0; i<(dstW>>1); i++){
-                               int j;
-                               int Y1=0;
-                               int Y2=0;
-                               int U=0;
-                               int V=0;
-                               int Cb, Cr, Cg;
-                               for(j=0; j<lumFilterSize; j++)
-                               {
-                                       Y1 += lumSrc[j][2*i] * lumFilter[j];
-                                       Y2 += lumSrc[j][2*i+1] * lumFilter[j];
-                               }
-                               for(j=0; j<chrFilterSize; j++)
-                               {
-                                       U += chrSrc[j][i] * chrFilter[j];
-                                       V += chrSrc[j][i+2048] * chrFilter[j];
-                               }
-                               Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
-                               Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
-                               U >>= 19;
-                               V >>= 19;
-
-                               Cb= clip_yuvtab_40cf[U+ 256];
-                               Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
-                               Cr= clip_yuvtab_3343[V+ 256];
-
-                               dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
-                               dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
-                               dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
-
-                               dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
-                               dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
-                               dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
-                       }
-               }
-               else if(dstbpp==24)
-               {
-                       int i;
-                       for(i=0; i<(dstW>>1); i++){
-                               int j;
-                               int Y1=0;
-                               int Y2=0;
-                               int U=0;
-                               int V=0;
-                               int Cb, Cr, Cg;
-                               for(j=0; j<lumFilterSize; j++)
-                               {
-                                       Y1 += lumSrc[j][2*i] * lumFilter[j];
-                                       Y2 += lumSrc[j][2*i+1] * lumFilter[j];
-                               }
-                               for(j=0; j<chrFilterSize; j++)
-                               {
-                                       U += chrSrc[j][i] * chrFilter[j];
-                                       V += chrSrc[j][i+2048] * chrFilter[j];
-                               }
-                               Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
-                               Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
-                               U >>= 19;
-                               V >>= 19;
+yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
+           chrFilter, chrSrc, chrFilterSize,
+           dest, dstW, dstbpp);
 
-                               Cb= clip_yuvtab_40cf[U+ 256];
-                               Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
-                               Cr= clip_yuvtab_3343[V+ 256];
-
-                               dest[0]=clip_table[((Y1 + Cb) >>13)];
-                               dest[1]=clip_table[((Y1 + Cg) >>13)];
-                               dest[2]=clip_table[((Y1 + Cr) >>13)];
-
-                               dest[3]=clip_table[((Y2 + Cb) >>13)];
-                               dest[4]=clip_table[((Y2 + Cg) >>13)];
-                               dest[5]=clip_table[((Y2 + Cr) >>13)];
-                               dest+=6;
-                       }
-               }
-               else if(dstbpp==16)
-               {
-                       int i;
-                       for(i=0; i<(dstW>>1); i++){
-                               int j;
-                               int Y1=0;
-                               int Y2=0;
-                               int U=0;
-                               int V=0;
-                               int Cb, Cr, Cg;
-                               for(j=0; j<lumFilterSize; j++)
-                               {
-                                       Y1 += lumSrc[j][2*i] * lumFilter[j];
-                                       Y2 += lumSrc[j][2*i+1] * lumFilter[j];
-                               }
-                               for(j=0; j<chrFilterSize; j++)
-                               {
-                                       U += chrSrc[j][i] * chrFilter[j];
-                                       V += chrSrc[j][i+2048] * chrFilter[j];
-                               }
-                               Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
-                               Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
-                               U >>= 19;
-                               V >>= 19;
-
-                               Cb= clip_yuvtab_40cf[U+ 256];
-                               Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
-                               Cr= clip_yuvtab_3343[V+ 256];
-
-                               ((uint16_t*)dest)[2*i] =
-                                       clip_table16b[(Y1 + Cb) >>13] |
-                                       clip_table16g[(Y1 + Cg) >>13] |
-                                       clip_table16r[(Y1 + Cr) >>13];
-
-                               ((uint16_t*)dest)[2*i+1] =
-                                       clip_table16b[(Y2 + Cb) >>13] |
-                                       clip_table16g[(Y2 + Cg) >>13] |
-                                       clip_table16r[(Y2 + Cr) >>13];
-                       }
-               }
-               else if(dstbpp==15)
-               {
-                       int i;
-                       for(i=0; i<(dstW>>1); i++){
-                               int j;
-                               int Y1=0;
-                               int Y2=0;
-                               int U=0;
-                               int V=0;
-                               int Cb, Cr, Cg;
-                               for(j=0; j<lumFilterSize; j++)
-                               {
-                                       Y1 += lumSrc[j][2*i] * lumFilter[j];
-                                       Y2 += lumSrc[j][2*i+1] * lumFilter[j];
-                               }
-                               for(j=0; j<chrFilterSize; j++)
-                               {
-                                       U += chrSrc[j][i] * chrFilter[j];
-                                       V += chrSrc[j][i+2048] * chrFilter[j];
-                               }
-                               Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
-                               Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
-                               U >>= 19;
-                               V >>= 19;
-
-                               Cb= clip_yuvtab_40cf[U+ 256];
-                               Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
-                               Cr= clip_yuvtab_3343[V+ 256];
-
-                               ((uint16_t*)dest)[2*i] =
-                                       clip_table15b[(Y1 + Cb) >>13] |
-                                       clip_table15g[(Y1 + Cg) >>13] |
-                                       clip_table15r[(Y1 + Cr) >>13];
-
-                               ((uint16_t*)dest)[2*i+1] =
-                                       clip_table15b[(Y2 + Cb) >>13] |
-                                       clip_table15g[(Y2 + Cg) >>13] |
-                                       clip_table15r[(Y2 + Cr) >>13];
-                       }
-               }
 #endif
        } //!FULL_UV_IPOL
 }
@@ -1373,7 +1196,6 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *
                            uint8_t *dest, int dstW, int uvalpha, int dstbpp)
 {
        int uvalpha1=uvalpha^4095;
-       const int yalpha=0;
        const int yalpha1=0;
 
        if(fullUVIpol || allwaysIpol)
@@ -1636,7 +1458,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
                        "movd %%mm0, (%4, %%ebp)        \n\t"
                        "addl $4, %%ebp                 \n\t"
                        " jnc 1b                        \n\t"
-                       
+
                        "popl %%ebp                     \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
@@ -1764,7 +1586,12 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
       // *** horizontal scale Y line to temp buffer
 static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
 {
+#ifdef HAVE_MMX
+       // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
+    if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
+#else
     if(sws_flags != SWS_FAST_BILINEAR)
+#endif
     {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
     }
@@ -1885,7 +1712,12 @@ FUNNY_Y_CODE
 inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
                                uint8_t *src1, uint8_t *src2, int srcW, int xInc)
 {
+#ifdef HAVE_MMX
+       // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
+    if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
+#else
     if(sws_flags != SWS_FAST_BILINEAR)
+#endif
     {
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
@@ -2026,12 +1858,13 @@ FUNNYUVCODE
    }
 }
 
-static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc,
+static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
                                      int srcW, int dstW, int filterAlign, int one)
 {
        int i;
+       double filter[8000];
 #ifdef HAVE_MMX
-       asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions) 
+       asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 #endif
 
        if(ABS(xInc - 0x10000) <10) // unscaled
@@ -2066,14 +1899,13 @@ static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *
                        if(sws_flags == SWS_BICUBIC)
                        {
                                double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
-//                             int coeff;
-                               int y1,y2,y3,y4;
+                               double y1,y2,y3,y4;
                                double A= -0.75;
                                        // Equation is from VirtualDub
-               y1 = (int)floor(0.5 + (        +     A*d -       2.0*A*d*d +       A*d*d*d) * 16384.0);
-               y2 = (int)floor(0.5 + (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0);
-               y3 = (int)floor(0.5 + (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0);
-               y4 = (int)floor(0.5 + (                  +           A*d*d -       A*d*d*d) * 16384.0);
+                               y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
+                               y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
+                               y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
+                               y4 = (                  +           A*d*d -       A*d*d*d);
 
 //                             printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
                                filter[i*(*filterSize) + 0]= y1;
@@ -2087,8 +1919,7 @@ static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *
                                for(j=0; j<*filterSize; j++)
                                {
                                        double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
-                                       int coeff;
-                                       coeff= (int)(0.5 + (1.0 - d)*(1<<14));
+                                       double coeff= 1.0 - d;
                                        if(coeff<0) coeff=0;
        //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
                                        filter[i*(*filterSize) + j]= coeff;
@@ -2116,24 +1947,22 @@ static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *
                        for(j=0; j<*filterSize; j++)
                        {
                                double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
-                               int coeff;
+                               double coeff;
                                if(sws_flags == SWS_BICUBIC)
                                {
                                        double A= -0.75;
 //                                     d*=2;
                                        // Equation is from VirtualDub
                                        if(d<1.0)
-                                               coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d
-                                                       + (A+2.0)*d*d*d) * (1<<14));
+                                               coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
                                        else if(d<2.0)
-                                               coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d
-                                                       - 5.0*A*d*d + A*d*d*d) * (1<<14));
+                                               coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
                                        else
-                                               coeff=0;
+                                               coeff=0.0;
                                }
                                else
                                {
-                                       coeff= (int)(0.5 + (1.0 - d)*(1<<14));
+                                       coeff= 1.0 - d;
                                        if(coeff<0) coeff=0;
                                }
 //                             if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
@@ -2160,17 +1989,17 @@ static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *
                        filterPos[i]= 0;
                }
 
-               if(filterPos[i] + *filterSize > srcW)
+               if(filterPos[i] + (*filterSize) > srcW)
                {
-                       int shift= filterPos[i] + *filterSize - srcW;
+                       int shift= filterPos[i] + (*filterSize) - srcW;
                        // Move filter coeffs right to compensate for filterPos
-                       for(j=*filterSize-2; j>=0; j--)
+                       for(j=(*filterSize)-2; j>=0; j--)
                        {
-                               int right= MIN(j + shift, *filterSize-1);
+                               int right= MIN(j + shift, (*filterSize)-1);
                                filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
                                filter[i*(*filterSize) +j]=0;
                        }
-                       filterPos[i]= srcW - *filterSize;
+                       filterPos[i]= srcW - (*filterSize);
                }
        }
 
@@ -2190,7 +2019,7 @@ static inline void RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *
                scale/= sum;
                for(j=0; j<*filterSize; j++)
                {
-                       filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
+                       dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
                }
        }
 }
@@ -2339,17 +2168,29 @@ static int chrBufIndex=0;
 
 static int firstTime=1;
 
-int widthAlign= dstbpp==12 ? 16 : 8;
-if(((dstW + widthAlign-1)&(~(widthAlign-1))) > dststride)
+const int widthAlign= dstbpp==12 ? 16 : 8;
+const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
+const int over= dstbpp==12 ?     (((dstW+15)&(~15))) - dststride
+                               : (((dstW+7)&(~7)))*bytespp - dststride;
+if(dststride%widthAlign !=0 )
 {
-       dstW&= ~(widthAlign-1);
        if(firstTime)
                fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
-                               "SwScaler: ->lowering width to compensate, new width=%d\n"
-                               "SwScaler: ->cannot do aligned memory acesses anymore\n",
-                               widthAlign, dstW);
+                               "SwScaler:          ->cannot do aligned memory acesses anymore\n",
+                               widthAlign);
 }
 
+if(over>0)
+{
+       if(firstTime)
+               fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
+                               "SwScaler:          and dststride is not large enough to handle %d extra bytes\n"
+                               "SwScaler:          ->using unoptimized C version for last line(s)\n",
+                               over);
+}
+
+
+
 //printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
 //printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
 
@@ -2357,9 +2198,11 @@ if(((dstW + widthAlign-1)&(~(widthAlign-1))) > dststride)
 canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
 if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
 {
-       if(firstTime) //FIXME only if verbose ?
+       if(firstTime)
                fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
 }
+#else
+canMMX2BeUsed=0; // should be 0 anyway but ...
 #endif
 
 if(firstTime)
@@ -2398,7 +2241,7 @@ if(firstTime)
 #elif defined (HAVE_MMX)
                fprintf(stderr, "using MMX\n");
 #elif defined (ARCH_X86)
-               fprintf(stderr, "using X86 ASM2\n");
+               fprintf(stderr, "using X86 ASM\n");
 #else
                fprintf(stderr, "using C\n");
 #endif
@@ -2413,13 +2256,15 @@ if(firstTime)
 if(sws_flags==SWS_FAST_BILINEAR)
 {
        if(canMMX2BeUsed)       lumXInc+= 20;
+#ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available
        else                    lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
+#endif
 }
 
 if(fullUVIpol && !(dstbpp==12))        chrXInc= lumXInc>>1, chrDstW= dstW;
-else                                   chrXInc= lumXInc,    chrDstW= dstW>>1;
+else                                   chrXInc= lumXInc,    chrDstW= (dstW+1)>>1;
 
-if(dstbpp==12) chrYInc= lumYInc,    chrDstH= dstH>>1;
+if(dstbpp==12) chrYInc= lumYInc,    chrDstH= (dstH+1)>>1;
 else           chrYInc= lumYInc>>1, chrDstH= dstH;
 
   // force calculation of the horizontal interpolation of the first line
@@ -2440,13 +2285,10 @@ else            chrYInc= lumYInc>>1, chrDstH= dstH;
 #endif
                oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
 
-               if(sws_flags != SWS_FAST_BILINEAR)
-               {
-                       RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
-                                          srcW   , dstW   , filterAlign, 1<<14);
-                       RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
-                                          srcW>>1, chrDstW, filterAlign, 1<<14);
-               }
+               RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
+                               srcW   , dstW   , filterAlign, 1<<14);
+               RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
+                               (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
 
 #ifdef HAVE_MMX2
 // cant downscale !!!
@@ -2470,7 +2312,7 @@ else              chrYInc= lumYInc>>1, chrDstH= dstH;
                RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
                                srcH   , dstH,    1, (1<<12)-4);
                RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
-                               srcH>>1, chrDstH, 1, (1<<12)-4);
+                               (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
 
                // Calculate Buffer Sizes so that they wont run out while handling these damn slices
                vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
@@ -2509,6 +2351,74 @@ else             chrYInc= lumYInc>>1, chrDstH= dstH;
 #endif
        }
 
+       if(firstTime && verbose)
+       {
+#ifdef HAVE_MMX2
+               int mmx2=1;
+#else
+               int mmx2=0;
+#endif
+#ifdef HAVE_MMX
+               int mmx=1;
+#else
+               int mmx=0;
+#endif
+
+#ifdef HAVE_MMX
+               if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
+                       printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
+               else
+               {
+                       if(hLumFilterSize==4)
+                               printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
+                       else if(hLumFilterSize==8)
+                               printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
+                       else
+                               printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
+
+                       if(hChrFilterSize==4)
+                               printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
+                       else if(hChrFilterSize==8)
+                               printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
+                       else
+                               printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
+               }
+#elif defined (ARCH_X86)
+               printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
+#else
+               if(sws_flags==SWS_FAST_BILINEAR)
+                       printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
+               else
+                       printf("SwScaler: using C scaler for horizontal scaling\n");
+#endif
+
+               if(dstbpp==12)
+               {
+                       if(vLumFilterSize==1)
+                               printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
+                       else
+                               printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
+               }
+               else
+               {
+                       if(vLumFilterSize==1 && vChrFilterSize==2)
+                               printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
+                                      "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
+                       else if(vLumFilterSize==2 && vChrFilterSize==2)
+                               printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
+                       else
+                               printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
+               }
+
+               if(dstbpp==24)
+                       printf("SwScaler: using %s YV12->BGR24 Converter\n",
+                               mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
+               else
+                       printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
+
+               printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
+       }
+
        lastInLumBuf= -1;
        lastInChrBuf= -1;
   } // if(firstLine)
@@ -2557,7 +2467,7 @@ else              chrYInc= lumYInc>>1, chrDstH= dstH;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
-                               RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc);
+                               RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
@@ -2590,7 +2500,7 @@ else              chrYInc= lumYInc>>1, chrDstH= dstH;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
-                               RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, srcW>>1, chrXInc);
+                               RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
@@ -2605,7 +2515,8 @@ else              chrYInc= lumYInc>>1, chrDstH= dstH;
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
 #endif
-
+           if(dstY < dstH-2 || over<=0)
+           {
                if(dstbpp==12) //YV12
                {
                        if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
@@ -2657,6 +2568,29 @@ else             chrYInc= lumYInc>>1, chrDstH= dstH;
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
                        }
                }
+            }
+           else // hmm looks like we can't use MMX here without overwriting this array's tail
+           {
+               int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+               int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+               if(dstbpp==12) //YV12
+               {
+                       if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
+                       yuv2yuvXinC(
+                               vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
+                               vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                               dest, uDest, vDest, dstW);
+               }
+               else
+               {
+                       ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
+                       ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
+                       yuv2rgbXinC(
+                               vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+                               vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                               dest, dstW, dstbpp);
+               }
+           }
        }
 
 #ifdef HAVE_MMX