]> git.sesse.net Git - ffmpeg/blob - postproc/swscale.c
IF09 is alias for YVU9 (actually it has extra 4th plane containing MC change
[ffmpeg] / postproc / swscale.c
1 /*
2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20   supported Input formats: YV12, I420/IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09
21   supported output formats: YV12, I420/IYUV, BGR15, BGR16, BGR24, BGR32, Y8/Y800, YVU9/IF09
22   BGR15/16 support dithering
23   
24   unscaled special converters
25   YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
26   YV12/I420/IYUV -> YV12/I420/IYUV
27   YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
28   BGR24 -> BGR32 & RGB24 -> RGB32
29   BGR32 -> BGR24 & RGB32 -> RGB24
30   BGR15 -> BGR16
31 */
32
33 /* 
34 tested special converters
35  YV12/I420 -> BGR16
36  YV12 -> YV12
37  BGR15 -> BGR16
38  BGR16 -> BGR16
39
40 untested special converters
41   YV12/I420 -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be ok)
42   YV12/I420 -> YV12/I420
43   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
44   BGR24 -> BGR32 & RGB24 -> RGB32
45   BGR32 -> BGR24 & RGB32 -> RGB24
46   BGR24 -> YV12
47 */
48
49 #include <inttypes.h>
50 #include <string.h>
51 #include <math.h>
52 #include <stdio.h>
53 #include "../config.h"
54 #include "../mangle.h"
55 #include <assert.h>
56 #ifdef HAVE_MALLOC_H
57 #include <malloc.h>
58 #else
59 #include <stdlib.h>
60 #endif
61 #include "swscale.h"
62 #include "../cpudetect.h"
63 #include "../bswap.h"
64 #include "../libvo/img_format.h"
65 #include "rgb2rgb.h"
66 #include "../libvo/fastmemcpy.h"
67 #include "../mp_msg.h"
68
69 #define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
70 #define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
71 #define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
72 #define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
73 #define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
74 #define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
75
76 #undef MOVNTQ
77 #undef PAVGB
78
79 //#undef HAVE_MMX2
80 //#define HAVE_3DNOW
81 //#undef HAVE_MMX
82 //#undef ARCH_X86
83 //#define WORDS_BIGENDIAN
84 #define DITHER1XBPP
85
86 #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
87
88 #define RET 0xC3 //near return opcode for X86
89
90 #ifdef MP_DEBUG
91 #define ASSERT(x) assert(x);
92 #else
93 #define ASSERT(x) ;
94 #endif
95
96 #ifdef M_PI
97 #define PI M_PI
98 #else
99 #define PI 3.14159265358979323846
100 #endif
101
102 //FIXME replace this with something faster
103 #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YVU9)
104 #define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
105 #define isGray(x)      ((x)==IMGFMT_Y800)
106 #define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
107                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
108                         || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
109                         || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
110 #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
111                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
112                         || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
113 #define isRGB(x)       (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
114 #define isBGR(x)       (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
115 #define isPacked(x)    ((x)==IMGFMT_YUY2 || isRGB(x) || isBGR(x))
116
117 #define RGB2YUV_SHIFT 16
118 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
119 #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
120 #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
121 #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
122 #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
123 #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
124 #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
125 #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
126 #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
127
128 extern int verbose; // defined in mplayer.c
129 /*
130 NOTES
131 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
132
133 TODO
134 more intelligent misalignment avoidance for the horizontal scaler
135 write special vertical cubic upscale version
136 Optimize C code (yv12 / minmax)
137 add support for packed pixel yuv input & output
138 add support for Y8 output
139 optimize bgr24 & bgr32
140 add BGR4 output support
141 write special BGR->BGR scaler
142 deglobalize yuv2rgb*.c
143 */
144
145 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
146 #define MIN(a,b) ((a) > (b) ? (b) : (a))
147 #define MAX(a,b) ((a) < (b) ? (b) : (a))
148
149 #ifdef ARCH_X86
150 #define CAN_COMPILE_X86_ASM
151 #endif
152
153 #ifdef CAN_COMPILE_X86_ASM
154 static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
155 static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
156 static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
157 static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
158 static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
159 static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
160 static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
161 static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
162 static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
163 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
164 static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
165 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
166 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
167 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
168 static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
169
170 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
171 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
172 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
173 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
174
175 static uint64_t __attribute__((aligned(8))) dither4[2]={
176         0x0103010301030103LL,
177         0x0200020002000200LL,};
178
179 static uint64_t __attribute__((aligned(8))) dither8[2]={
180         0x0602060206020602LL,
181         0x0004000400040004LL,};
182
183 static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
184 static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
185 static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
186 static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
187 static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
188 static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
189
190 static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
191 static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
192 static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
193
194 #ifdef FAST_BGR2YV12
195 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
196 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
197 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
198 #else
199 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
200 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
201 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
202 #endif
203 static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
204 static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
205 static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
206
207 // FIXME remove
208 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
209 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
210 #endif
211
212 // clipping helper table for C implementations:
213 static unsigned char clip_table[768];
214
215 static unsigned short clip_table16b[768];
216 static unsigned short clip_table16g[768];
217 static unsigned short clip_table16r[768];
218 static unsigned short clip_table15b[768];
219 static unsigned short clip_table15g[768];
220 static unsigned short clip_table15r[768];
221
222 // yuv->rgb conversion tables:
223 static    int yuvtab_2568[256];
224 static    int yuvtab_3343[256];
225 static    int yuvtab_0c92[256];
226 static    int yuvtab_1a1e[256];
227 static    int yuvtab_40cf[256];
228 // Needed for cubic scaler to catch overflows
229 static    int clip_yuvtab_2568[768];
230 static    int clip_yuvtab_3343[768];
231 static    int clip_yuvtab_0c92[768];
232 static    int clip_yuvtab_1a1e[768];
233 static    int clip_yuvtab_40cf[768];
234
235 //global sws_flags from the command line
236 int sws_flags=2;
237
238 //global srcFilter
239 SwsFilter src_filter= {NULL, NULL, NULL, NULL};
240
241 float sws_lum_gblur= 0.0;
242 float sws_chr_gblur= 0.0;
243 int sws_chr_vshift= 0;
244 int sws_chr_hshift= 0;
245 float sws_chr_sharpen= 0.0;
246 float sws_lum_sharpen= 0.0;
247
248 /* cpuCaps combined from cpudetect and what's actually compiled in
249    (if support for something is not compiled in, it won't appear here) */
250 static CpuCaps cpuCaps;
251
252 void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
253              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
254
255 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
256
257 #ifdef CAN_COMPILE_X86_ASM
258 void in_asm_used_var_warning_killer()
259 {
260  volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
261  bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
262  M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
263  if(i) i=0;
264 }
265 #endif
266
267 static int testFormat[]={
268 IMGFMT_YVU9,
269 IMGFMT_YV12,
270 //IMGFMT_IYUV,
271 IMGFMT_I420,
272 IMGFMT_BGR15,
273 IMGFMT_BGR16,
274 IMGFMT_BGR24,
275 IMGFMT_BGR32,
276 //IMGFMT_Y8,
277 IMGFMT_Y800,
278 //IMGFMT_YUY2,
279 0
280 };
281
/*
 * Returns the sum of squared differences between two 8 bit planes.
 * src1/src2  first sample of each plane
 * stride1/2  distance in samples between the starts of consecutive rows
 * w, h       size of the compared area; nothing is read if either is <= 0
 */
static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
	uint64_t total= 0;
	int row, col;

	for(row=0; row<h; row++){
		const int base1= row*stride1;
		const int base2= row*stride2;

		for(col=0; col<w; col++){
			const int diff= src1[base1 + col] - src2[base2 + col];
			total+= diff*diff;
		}
	}
	return total;
}
294
295 // test by ref -> src -> dst -> out & compare out against ref
296 // ref & out are YV12
297 static void doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat, 
298                    int srcW, int srcH, int dstW, int dstH, int flags){
299         uint8_t *src[3];
300         uint8_t *dst[3];
301         uint8_t *out[3];
302         int srcStride[3], dstStride[3];
303         int i;
304         uint64_t ssdY, ssdU, ssdV;
305         SwsContext *srcContext, *dstContext, *outContext;
306         
307         for(i=0; i<3; i++){
308                 srcStride[i]= srcW*4;
309                 dstStride[i]= dstW*4;
310                 src[i]= malloc(srcStride[i]*srcH);
311                 dst[i]= malloc(dstStride[i]*dstH);
312                 out[i]= malloc(refStride[i]*h);
313         }
314
315         srcContext= getSwsContext(w, h, IMGFMT_YV12, srcW, srcH, srcFormat, flags, NULL, NULL);
316         dstContext= getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL);
317         outContext= getSwsContext(dstW, dstH, dstFormat, w, h, IMGFMT_YV12, flags, NULL, NULL);
318         if(srcContext==NULL ||dstContext==NULL ||outContext==NULL){
319                 printf("Failed allocating swsContext\n");
320                 goto end;
321         }
322 //      printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2],
323 //              (int)src[0], (int)src[1], (int)src[2]);
324
325         srcContext->swScale(srcContext, ref, refStride, 0, h   , src, srcStride);
326         dstContext->swScale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
327         outContext->swScale(outContext, dst, dstStride, 0, dstH, out, refStride);
328              
329         ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
330         ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
331         ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
332         
333         if(isGray(srcFormat) || isGray(dstFormat)) ssdU=ssdV=0; //FIXME check that output is really gray
334         
335         ssdY/= w*h;
336         ssdU/= w*h/4;
337         ssdV/= w*h/4;
338         
339         if(ssdY>100 || ssdU>50 || ssdV>50){
340                 printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n", 
341                         vo_format_name(srcFormat), srcW, srcH, 
342                         vo_format_name(dstFormat), dstW, dstH,
343                         flags,
344                         ssdY, ssdU, ssdV);
345         }
346
347         end:
348         
349         freeSwsContext(srcContext);
350         freeSwsContext(dstContext);
351         freeSwsContext(outContext);
352
353         for(i=0; i<3; i++){
354                 free(src[i]);
355                 free(dst[i]);
356                 free(out[i]);
357         }
358 }
359
360 static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
361         int srcFormat, dstFormat, srcFormatIndex, dstFormatIndex;
362         int srcW, srcH, dstW, dstH;
363         int flags;
364
365         for(srcFormatIndex=0; ;srcFormatIndex++){
366                 srcFormat= testFormat[srcFormatIndex];
367                 if(!srcFormat) break;
368                 for(dstFormatIndex=0; ;dstFormatIndex++){
369                         dstFormat= testFormat[dstFormatIndex];
370                         if(!dstFormat) break;
371                         if(!isSupportedOut(dstFormat)) continue;
372
373                         srcW= w+w/3;
374                         srcH= h+h/3;
375                         for(dstW=w; dstW<w*2; dstW+= dstW/3){
376                                 for(dstH=h; dstH<h*2; dstH+= dstH/3){
377                                         for(flags=1; flags<33; flags*=2)
378                                                 doTest(src, stride, w, h, srcFormat, dstFormat,
379                                                         srcW, srcH, dstW, dstH, flags);
380                                 }
381                         }
382                 }
383         }
384 }
385
386 static inline void yuv2yuvXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
387                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
388                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest)
389 {
390         //FIXME Optimize (just quickly writen not opti..)
391         int i;
392         for(i=0; i<c->dstW; i++)
393         {
394                 int val=0;
395                 int j;
396                 for(j=0; j<lumFilterSize; j++)
397                         val += lumSrc[j][i] * lumFilter[j];
398
399                 dest[i]= MIN(MAX(val>>19, 0), 255);
400         }
401
402         if(uDest != NULL)
403                 for(i=0; i<c->chrDstW; i++)
404                 {
405                         int u=0;
406                         int v=0;
407                         int j;
408                         for(j=0; j<chrFilterSize; j++)
409                         {
410                                 u += chrSrc[j][i] * chrFilter[j];
411                                 v += chrSrc[j][i + 2048] * chrFilter[j];
412                         }
413
414                         uDest[i]= MIN(MAX(u>>19, 0), 255);
415                         vDest[i]= MIN(MAX(v>>19, 0), 255);
416                 }
417 }
418
419 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
420                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
421                                     uint8_t *dest, int dstW, int dstFormat)
422 {
423         if(dstFormat==IMGFMT_BGR32)
424         {
425                 int i;
426 #ifdef WORDS_BIGENDIAN
427         dest++;
428 #endif
429                 for(i=0; i<(dstW>>1); i++){
430                         int j;
431                         int Y1=0;
432                         int Y2=0;
433                         int U=0;
434                         int V=0;
435                         int Cb, Cr, Cg;
436                         for(j=0; j<lumFilterSize; j++)
437                         {
438                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
439                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
440                         }
441                         for(j=0; j<chrFilterSize; j++)
442                         {
443                                 U += chrSrc[j][i] * chrFilter[j];
444                                 V += chrSrc[j][i+2048] * chrFilter[j];
445                         }
446                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
447                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
448                         U >>= 19;
449                         V >>= 19;
450
451                         Cb= clip_yuvtab_40cf[U+ 256];
452                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
453                         Cr= clip_yuvtab_3343[V+ 256];
454
455                         dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
456                         dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
457                         dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
458
459                         dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
460                         dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
461                         dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
462                 }
463         }
464         else if(dstFormat==IMGFMT_BGR24)
465         {
466                 int i;
467                 for(i=0; i<(dstW>>1); i++){
468                         int j;
469                         int Y1=0;
470                         int Y2=0;
471                         int U=0;
472                         int V=0;
473                         int Cb, Cr, Cg;
474                         for(j=0; j<lumFilterSize; j++)
475                         {
476                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
477                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
478                         }
479                         for(j=0; j<chrFilterSize; j++)
480                         {
481                                 U += chrSrc[j][i] * chrFilter[j];
482                                 V += chrSrc[j][i+2048] * chrFilter[j];
483                         }
484                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
485                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
486                         U >>= 19;
487                         V >>= 19;
488
489                         Cb= clip_yuvtab_40cf[U+ 256];
490                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
491                         Cr= clip_yuvtab_3343[V+ 256];
492
493                         dest[0]=clip_table[((Y1 + Cb) >>13)];
494                         dest[1]=clip_table[((Y1 + Cg) >>13)];
495                         dest[2]=clip_table[((Y1 + Cr) >>13)];
496
497                         dest[3]=clip_table[((Y2 + Cb) >>13)];
498                         dest[4]=clip_table[((Y2 + Cg) >>13)];
499                         dest[5]=clip_table[((Y2 + Cr) >>13)];
500                         dest+=6;
501                 }
502         }
503         else if(dstFormat==IMGFMT_BGR16)
504         {
505                 int i;
506 #ifdef DITHER1XBPP
507                 static int ditherb1=1<<14;
508                 static int ditherg1=1<<13;
509                 static int ditherr1=2<<14;
510                 static int ditherb2=3<<14;
511                 static int ditherg2=3<<13;
512                 static int ditherr2=0<<14;
513
514                 ditherb1 ^= (1^2)<<14;
515                 ditherg1 ^= (1^2)<<13;
516                 ditherr1 ^= (1^2)<<14;
517                 ditherb2 ^= (3^0)<<14;
518                 ditherg2 ^= (3^0)<<13;
519                 ditherr2 ^= (3^0)<<14;
520 #else
521                 const int ditherb1=0;
522                 const int ditherg1=0;
523                 const int ditherr1=0;
524                 const int ditherb2=0;
525                 const int ditherg2=0;
526                 const int ditherr2=0;
527 #endif
528                 for(i=0; i<(dstW>>1); i++){
529                         int j;
530                         int Y1=0;
531                         int Y2=0;
532                         int U=0;
533                         int V=0;
534                         int Cb, Cr, Cg;
535                         for(j=0; j<lumFilterSize; j++)
536                         {
537                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
538                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
539                         }
540                         for(j=0; j<chrFilterSize; j++)
541                         {
542                                 U += chrSrc[j][i] * chrFilter[j];
543                                 V += chrSrc[j][i+2048] * chrFilter[j];
544                         }
545                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
546                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
547                         U >>= 19;
548                         V >>= 19;
549
550                         Cb= clip_yuvtab_40cf[U+ 256];
551                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
552                         Cr= clip_yuvtab_3343[V+ 256];
553
554                         ((uint16_t*)dest)[2*i] =
555                                 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
556                                 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
557                                 clip_table16r[(Y1 + Cr + ditherr1) >>13];
558
559                         ((uint16_t*)dest)[2*i+1] =
560                                 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
561                                 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
562                                 clip_table16r[(Y2 + Cr + ditherr2) >>13];
563                 }
564         }
565         else if(dstFormat==IMGFMT_BGR15)
566         {
567                 int i;
568 #ifdef DITHER1XBPP
569                 static int ditherb1=1<<14;
570                 static int ditherg1=1<<14;
571                 static int ditherr1=2<<14;
572                 static int ditherb2=3<<14;
573                 static int ditherg2=3<<14;
574                 static int ditherr2=0<<14;
575
576                 ditherb1 ^= (1^2)<<14;
577                 ditherg1 ^= (1^2)<<14;
578                 ditherr1 ^= (1^2)<<14;
579                 ditherb2 ^= (3^0)<<14;
580                 ditherg2 ^= (3^0)<<14;
581                 ditherr2 ^= (3^0)<<14;
582 #else
583                 const int ditherb1=0;
584                 const int ditherg1=0;
585                 const int ditherr1=0;
586                 const int ditherb2=0;
587                 const int ditherg2=0;
588                 const int ditherr2=0;
589 #endif
590                 for(i=0; i<(dstW>>1); i++){
591                         int j;
592                         int Y1=0;
593                         int Y2=0;
594                         int U=0;
595                         int V=0;
596                         int Cb, Cr, Cg;
597                         for(j=0; j<lumFilterSize; j++)
598                         {
599                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
600                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
601                         }
602                         for(j=0; j<chrFilterSize; j++)
603                         {
604                                 U += chrSrc[j][i] * chrFilter[j];
605                                 V += chrSrc[j][i+2048] * chrFilter[j];
606                         }
607                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
608                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
609                         U >>= 19;
610                         V >>= 19;
611
612                         Cb= clip_yuvtab_40cf[U+ 256];
613                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
614                         Cr= clip_yuvtab_3343[V+ 256];
615
616                         ((uint16_t*)dest)[2*i] =
617                                 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
618                                 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
619                                 clip_table15r[(Y1 + Cr + ditherr1) >>13];
620
621                         ((uint16_t*)dest)[2*i+1] =
622                                 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
623                                 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
624                                 clip_table15r[(Y2 + Cr + ditherr2) >>13];
625                 }
626         }
627 }
628
629
630 //Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
631 //Plain C versions
632 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
633 #define COMPILE_C
634 #endif
635
636 #ifdef CAN_COMPILE_X86_ASM
637
638 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
639 #define COMPILE_MMX
640 #endif
641
642 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
643 #define COMPILE_MMX2
644 #endif
645
646 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
647 #define COMPILE_3DNOW
648 #endif
649 #endif //CAN_COMPILE_X86_ASM
650
651 #undef HAVE_MMX
652 #undef HAVE_MMX2
653 #undef HAVE_3DNOW
654
655 #ifdef COMPILE_C
656 #undef HAVE_MMX
657 #undef HAVE_MMX2
658 #undef HAVE_3DNOW
659 #define RENAME(a) a ## _C
660 #include "swscale_template.c"
661 #endif
662
663 #ifdef CAN_COMPILE_X86_ASM
664
665 //X86 versions
666 /*
667 #undef RENAME
668 #undef HAVE_MMX
669 #undef HAVE_MMX2
670 #undef HAVE_3DNOW
671 #define ARCH_X86
672 #define RENAME(a) a ## _X86
673 #include "swscale_template.c"
674 */
675 //MMX versions
676 #ifdef COMPILE_MMX
677 #undef RENAME
678 #define HAVE_MMX
679 #undef HAVE_MMX2
680 #undef HAVE_3DNOW
681 #define RENAME(a) a ## _MMX
682 #include "swscale_template.c"
683 #endif
684
685 //MMX2 versions
686 #ifdef COMPILE_MMX2
687 #undef RENAME
688 #define HAVE_MMX
689 #define HAVE_MMX2
690 #undef HAVE_3DNOW
691 #define RENAME(a) a ## _MMX2
692 #include "swscale_template.c"
693 #endif
694
695 //3DNOW versions
696 #ifdef COMPILE_3DNOW
697 #undef RENAME
698 #define HAVE_MMX
699 #undef HAVE_MMX2
700 #define HAVE_3DNOW
701 #define RENAME(a) a ## _3DNow
702 #include "swscale_template.c"
703 #endif
704
705 #endif //CAN_COMPILE_X86_ASM
706
707 // minor note: the HAVE_xyz defines are messed up after this point, so don't use them
708
709
710 // old global scaler, dont use for new code
711 // will use sws_flags from the command line
712 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
713                              int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
714                              int srcW, int srcH, int dstW, int dstH){
715
716         static SwsContext *context=NULL;
717         int dstFormat;
718         int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
719
720         switch(dstbpp)
721         {
722                 case 8 : dstFormat= IMGFMT_Y8;          break;
723                 case 12: dstFormat= IMGFMT_YV12;        break;
724                 case 15: dstFormat= IMGFMT_BGR15;       break;
725                 case 16: dstFormat= IMGFMT_BGR16;       break;
726                 case 24: dstFormat= IMGFMT_BGR24;       break;
727                 case 32: dstFormat= IMGFMT_BGR32;       break;
728                 default: return;
729         }
730
731         if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
732
733         context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
734 }
735
736 // will use sws_flags & src_filter (from cmd line)
737 SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
738 {
739         int flags=0;
740         static int firstTime=1;
741
742 #ifdef ARCH_X86
743         if(gCpuCaps.hasMMX)
744                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
745 #endif
746         if(firstTime)
747         {
748                 firstTime=0;
749                 flags= SWS_PRINT_INFO;
750         }
751         else if(verbose>1) flags= SWS_PRINT_INFO;
752
753         if(src_filter.lumH) freeVec(src_filter.lumH);
754         if(src_filter.lumV) freeVec(src_filter.lumV);
755         if(src_filter.chrH) freeVec(src_filter.chrH);
756         if(src_filter.chrV) freeVec(src_filter.chrV);
757
758         if(sws_lum_gblur!=0.0){
759                 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
760                 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
761         }else{
762                 src_filter.lumH= getIdentityVec();
763                 src_filter.lumV= getIdentityVec();
764         }
765
766         if(sws_chr_gblur!=0.0){
767                 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
768                 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
769         }else{
770                 src_filter.chrH= getIdentityVec();
771                 src_filter.chrV= getIdentityVec();
772         }
773
774         if(sws_chr_sharpen!=0.0){
775                 SwsVector *g= getConstVec(-1.0, 3);
776                 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
777                 g->coeff[1]=2.0;
778                 addVec(id, g);
779                 convVec(src_filter.chrH, id);
780                 convVec(src_filter.chrV, id);
781                 freeVec(g);
782                 freeVec(id);
783         }
784
785         if(sws_lum_sharpen!=0.0){
786                 SwsVector *g= getConstVec(-1.0, 3);
787                 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
788                 g->coeff[1]=2.0;
789                 addVec(id, g);
790                 convVec(src_filter.lumH, id);
791                 convVec(src_filter.lumV, id);
792                 freeVec(g);
793                 freeVec(id);
794         }
795
796         if(sws_chr_hshift)
797                 shiftVec(src_filter.chrH, sws_chr_hshift);
798
799         if(sws_chr_vshift)
800                 shiftVec(src_filter.chrV, sws_chr_vshift);
801
802         normalizeVec(src_filter.chrH, 1.0);
803         normalizeVec(src_filter.chrV, 1.0);
804         normalizeVec(src_filter.lumH, 1.0);
805         normalizeVec(src_filter.lumV, 1.0);
806
807         if(verbose > 1) printVec(src_filter.chrH);
808         if(verbose > 1) printVec(src_filter.lumH);
809
810         switch(sws_flags)
811         {
812                 case 0: flags|= SWS_FAST_BILINEAR; break;
813                 case 1: flags|= SWS_BILINEAR; break;
814                 case 2: flags|= SWS_BICUBIC; break;
815                 case 3: flags|= SWS_X; break;
816                 case 4: flags|= SWS_POINT; break;
817                 case 5: flags|= SWS_AREA; break;
818                 default:flags|= SWS_BILINEAR; break;
819         }
820
821         return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
822 }
823
824
825 static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
826                               int srcW, int dstW, int filterAlign, int one, int flags,
827                               SwsVector *srcFilter, SwsVector *dstFilter)
828 {
829         int i;
830         int filterSize;
831         int filter2Size;
832         int minFilterSize;
833         double *filter=NULL;
834         double *filter2=NULL;
835 #ifdef ARCH_X86
836         if(gCpuCaps.hasMMX)
837                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
838 #endif
839
840         // Note the +1 is for the MMXscaler which reads over the end
841         *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
842
843         if(ABS(xInc - 0x10000) <10) // unscaled
844         {
845                 int i;
846                 filterSize= 1;
847                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
848                 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
849
850                 for(i=0; i<dstW; i++)
851                 {
852                         filter[i*filterSize]=1;
853                         (*filterPos)[i]=i;
854                 }
855
856         }
857         else if(flags&SWS_POINT) // lame looking point sampling mode
858         {
859                 int i;
860                 int xDstInSrc;
861                 filterSize= 1;
862                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
863                 
864                 xDstInSrc= xInc/2 - 0x8000;
865                 for(i=0; i<dstW; i++)
866                 {
867                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
868
869                         (*filterPos)[i]= xx;
870                         filter[i]= 1.0;
871                         xDstInSrc+= xInc;
872                 }
873         }
874         else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
875         {
876                 int i;
877                 int xDstInSrc;
878                 if     (flags&SWS_BICUBIC) filterSize= 4;
879                 else if(flags&SWS_X      ) filterSize= 4;
880                 else                       filterSize= 2; // SWS_BILINEAR / SWS_AREA 
881                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
882
883                 xDstInSrc= xInc/2 - 0x8000;
884                 for(i=0; i<dstW; i++)
885                 {
886                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
887                         int j;
888
889                         (*filterPos)[i]= xx;
890                         if((flags & SWS_BICUBIC) || (flags & SWS_X))
891                         {
892                                 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
893                                 double y1,y2,y3,y4;
894                                 double A= -0.6;
895                                 if(flags & SWS_BICUBIC){
896                                                 // Equation is from VirtualDub
897                                         y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
898                                         y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
899                                         y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
900                                         y4 = (                  +           A*d*d -       A*d*d*d);
901                                 }else{
902                                                 // cubic interpolation (derived it myself)
903                                         y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
904                                         y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
905                                         y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
906                                         y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
907                                 }
908
909                                 filter[i*filterSize + 0]= y1;
910                                 filter[i*filterSize + 1]= y2;
911                                 filter[i*filterSize + 2]= y3;
912                                 filter[i*filterSize + 3]= y4;
913                         }
914                         else
915                         {
916                                 //Bilinear upscale / linear interpolate / Area averaging
917                                 for(j=0; j<filterSize; j++)
918                                 {
919                                         double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
920                                         double coeff= 1.0 - d;
921                                         if(coeff<0) coeff=0;
922                                         filter[i*filterSize + j]= coeff;
923                                         xx++;
924                                 }
925                         }
926                         xDstInSrc+= xInc;
927                 }
928         }
929         else // downscale
930         {
931                 int xDstInSrc;
932                 ASSERT(dstW <= srcW)
933
934                 if(flags&SWS_BICUBIC)   filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
935                 else if(flags&SWS_X)    filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
936                 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
937                 else /* BILINEAR */     filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
938                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
939
940                 xDstInSrc= xInc/2 - 0x8000;
941                 for(i=0; i<dstW; i++)
942                 {
943                         int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
944                         int j;
945                         (*filterPos)[i]= xx;
946                         for(j=0; j<filterSize; j++)
947                         {
948                                 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
949                                 double coeff;
950                                 if((flags & SWS_BICUBIC) || (flags & SWS_X))
951                                 {
952                                         double A= -0.75;
953 //                                      d*=2;
954                                         // Equation is from VirtualDub
955                                         if(d<1.0)
956                                                 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
957                                         else if(d<2.0)
958                                                 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
959                                         else
960                                                 coeff=0.0;
961                                 }
962                                 else if(flags & SWS_AREA)
963                                 {
964                                         double srcPixelSize= (1<<16)/(double)xInc;
965                                         if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
966                                         else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
967                                         else coeff=0.0;
968                                 }
969                                 else
970                                 {
971                                         coeff= 1.0 - d;
972                                         if(coeff<0) coeff=0;
973                                 }
974                                 filter[i*filterSize + j]= coeff;
975                                 xx++;
976                         }
977                         xDstInSrc+= xInc;
978                 }
979         }
980
981         /* apply src & dst Filter to filter -> filter2
982            free(filter);
983         */
984         ASSERT(filterSize>0)
985         filter2Size= filterSize;
986         if(srcFilter) filter2Size+= srcFilter->length - 1;
987         if(dstFilter) filter2Size+= dstFilter->length - 1;
988         ASSERT(filter2Size>0)
989         filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
990
991         for(i=0; i<dstW; i++)
992         {
993                 int j;
994                 SwsVector scaleFilter;
995                 SwsVector *outVec;
996
997                 scaleFilter.coeff= filter + i*filterSize;
998                 scaleFilter.length= filterSize;
999
1000                 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
1001                 else          outVec= &scaleFilter;
1002
1003                 ASSERT(outVec->length == filter2Size)
1004                 //FIXME dstFilter
1005
1006                 for(j=0; j<outVec->length; j++)
1007                 {
1008                         filter2[i*filter2Size + j]= outVec->coeff[j];
1009                 }
1010
1011                 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
1012
1013                 if(outVec != &scaleFilter) freeVec(outVec);
1014         }
1015         free(filter); filter=NULL;
1016
1017         /* try to reduce the filter-size (step1 find size and shift left) */
1018         // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
1019         minFilterSize= 0;
1020         for(i=dstW-1; i>=0; i--)
1021         {
1022                 int min= filter2Size;
1023                 int j;
1024                 double cutOff=0.0;
1025
1026                 /* get rid off near zero elements on the left by shifting left */
1027                 for(j=0; j<filter2Size; j++)
1028                 {
1029                         int k;
1030                         cutOff += ABS(filter2[i*filter2Size]);
1031
1032                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
1033
1034                         /* preserve Monotonicity because the core cant handle the filter otherwise */
1035                         if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
1036
1037                         // Move filter coeffs left
1038                         for(k=1; k<filter2Size; k++)
1039                                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
1040                         filter2[i*filter2Size + k - 1]= 0.0;
1041                         (*filterPos)[i]++;
1042                 }
1043
1044                 cutOff=0.0;
1045                 /* count near zeros on the right */
1046                 for(j=filter2Size-1; j>0; j--)
1047                 {
1048                         cutOff += ABS(filter2[i*filter2Size + j]);
1049
1050                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
1051                         min--;
1052                 }
1053
1054                 if(min>minFilterSize) minFilterSize= min;
1055         }
1056
1057         ASSERT(minFilterSize > 0)
1058         filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
1059         ASSERT(filterSize > 0)
1060         filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
1061         *outFilterSize= filterSize;
1062
1063         if(flags&SWS_PRINT_INFO)
1064                 MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
1065         /* try to reduce the filter-size (step2 reduce it) */
1066         for(i=0; i<dstW; i++)
1067         {
1068                 int j;
1069
1070                 for(j=0; j<filterSize; j++)
1071                 {
1072                         if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
1073                         else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
1074                 }
1075         }
1076         free(filter2); filter2=NULL;
1077         
1078
1079         //FIXME try to align filterpos if possible
1080
1081         //fix borders
1082         for(i=0; i<dstW; i++)
1083         {
1084                 int j;
1085                 if((*filterPos)[i] < 0)
1086                 {
1087                         // Move filter coeffs left to compensate for filterPos
1088                         for(j=1; j<filterSize; j++)
1089                         {
1090                                 int left= MAX(j + (*filterPos)[i], 0);
1091                                 filter[i*filterSize + left] += filter[i*filterSize + j];
1092                                 filter[i*filterSize + j]=0;
1093                         }
1094                         (*filterPos)[i]= 0;
1095                 }
1096
1097                 if((*filterPos)[i] + filterSize > srcW)
1098                 {
1099                         int shift= (*filterPos)[i] + filterSize - srcW;
1100                         // Move filter coeffs right to compensate for filterPos
1101                         for(j=filterSize-2; j>=0; j--)
1102                         {
1103                                 int right= MIN(j + shift, filterSize-1);
1104                                 filter[i*filterSize +right] += filter[i*filterSize +j];
1105                                 filter[i*filterSize +j]=0;
1106                         }
1107                         (*filterPos)[i]= srcW - filterSize;
1108                 }
1109         }
1110
1111         // Note the +1 is for the MMXscaler which reads over the end
1112         *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
1113         memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
1114
1115         /* Normalize & Store in outFilter */
1116         for(i=0; i<dstW; i++)
1117         {
1118                 int j;
1119                 double sum=0;
1120                 double scale= one;
1121                 for(j=0; j<filterSize; j++)
1122                 {
1123                         sum+= filter[i*filterSize + j];
1124                 }
1125                 scale/= sum;
1126                 for(j=0; j<filterSize; j++)
1127                 {
1128                         (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
1129                 }
1130         }
1131         
1132         (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1133         for(i=0; i<*outFilterSize; i++)
1134         {
1135                 int j= dstW*(*outFilterSize);
1136                 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1137         }
1138
1139         free(filter);
1140 }
1141
1142 #ifdef ARCH_X86
1143 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
1144 {
1145         uint8_t *fragmentA;
1146         int imm8OfPShufW1A;
1147         int imm8OfPShufW2A;
1148         int fragmentLengthA;
1149         uint8_t *fragmentB;
1150         int imm8OfPShufW1B;
1151         int imm8OfPShufW2B;
1152         int fragmentLengthB;
1153         int fragmentPos;
1154
1155         int xpos, i;
1156
1157         // create an optimized horizontal scaling routine
1158
1159         //code fragment
1160
1161         asm volatile(
1162                 "jmp 9f                         \n\t"
1163         // Begin
1164                 "0:                             \n\t"
1165                 "movq (%%edx, %%eax), %%mm3     \n\t" 
1166                 "movd (%%ecx, %%esi), %%mm0     \n\t" 
1167                 "movd 1(%%ecx, %%esi), %%mm1    \n\t"
1168                 "punpcklbw %%mm7, %%mm1         \n\t"
1169                 "punpcklbw %%mm7, %%mm0         \n\t"
1170                 "pshufw $0xFF, %%mm1, %%mm1     \n\t"
1171                 "1:                             \n\t"
1172                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
1173                 "2:                             \n\t"
1174                 "psubw %%mm1, %%mm0             \n\t"
1175                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
1176                 "pmullw %%mm3, %%mm0            \n\t"
1177                 "psllw $7, %%mm1                \n\t"
1178                 "paddw %%mm1, %%mm0             \n\t"
1179
1180                 "movq %%mm0, (%%edi, %%eax)     \n\t"
1181
1182                 "addl $8, %%eax                 \n\t"
1183         // End
1184                 "9:                             \n\t"
1185 //              "int $3\n\t"
1186                 "leal 0b, %0                    \n\t"
1187                 "leal 1b, %1                    \n\t"
1188                 "leal 2b, %2                    \n\t"
1189                 "decl %1                        \n\t"
1190                 "decl %2                        \n\t"
1191                 "subl %0, %1                    \n\t"
1192                 "subl %0, %2                    \n\t"
1193                 "leal 9b, %3                    \n\t"
1194                 "subl %0, %3                    \n\t"
1195
1196
1197                 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1198                 "=r" (fragmentLengthA)
1199         );
1200
1201         asm volatile(
1202                 "jmp 9f                         \n\t"
1203         // Begin
1204                 "0:                             \n\t"
1205                 "movq (%%edx, %%eax), %%mm3     \n\t" 
1206                 "movd (%%ecx, %%esi), %%mm0     \n\t" 
1207                 "punpcklbw %%mm7, %%mm0         \n\t"
1208                 "pshufw $0xFF, %%mm0, %%mm1     \n\t"
1209                 "1:                             \n\t"
1210                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
1211                 "2:                             \n\t"
1212                 "psubw %%mm1, %%mm0             \n\t"
1213                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
1214                 "pmullw %%mm3, %%mm0            \n\t"
1215                 "psllw $7, %%mm1                \n\t"
1216                 "paddw %%mm1, %%mm0             \n\t"
1217
1218                 "movq %%mm0, (%%edi, %%eax)     \n\t"
1219
1220                 "addl $8, %%eax                 \n\t"
1221         // End
1222                 "9:                             \n\t"
1223 //              "int $3\n\t"
1224                 "leal 0b, %0                    \n\t"
1225                 "leal 1b, %1                    \n\t"
1226                 "leal 2b, %2                    \n\t"
1227                 "decl %1                        \n\t"
1228                 "decl %2                        \n\t"
1229                 "subl %0, %1                    \n\t"
1230                 "subl %0, %2                    \n\t"
1231                 "leal 9b, %3                    \n\t"
1232                 "subl %0, %3                    \n\t"
1233
1234
1235                 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1236                 "=r" (fragmentLengthB)
1237         );
1238
1239         xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1240         fragmentPos=0;
1241         
1242         for(i=0; i<dstW/numSplits; i++)
1243         {
1244                 int xx=xpos>>16;
1245
1246                 if((i&3) == 0)
1247                 {
1248                         int a=0;
1249                         int b=((xpos+xInc)>>16) - xx;
1250                         int c=((xpos+xInc*2)>>16) - xx;
1251                         int d=((xpos+xInc*3)>>16) - xx;
1252
1253                         filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
1254                         filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
1255                         filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
1256                         filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
1257                         filterPos[i/2]= xx;
1258
1259                         if(d+1<4)
1260                         {
1261                                 int maxShift= 3-(d+1);
1262                                 int shift=0;
1263
1264                                 memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
1265
1266                                 funnyCode[fragmentPos + imm8OfPShufW1B]=
1267                                         (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
1268                                 funnyCode[fragmentPos + imm8OfPShufW2B]=
1269                                         a | (b<<2) | (c<<4) | (d<<6);
1270
1271                                 if(i+3>=dstW) shift=maxShift; //avoid overread
1272                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
1273
1274                                 if(shift && i>=shift)
1275                                 {
1276                                         funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
1277                                         funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
1278                                         filterPos[i/2]-=shift;
1279                                 }
1280
1281                                 fragmentPos+= fragmentLengthB;
1282                         }
1283                         else
1284                         {
1285                                 int maxShift= 3-d;
1286                                 int shift=0;
1287
1288                                 memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
1289
1290                                 funnyCode[fragmentPos + imm8OfPShufW1A]=
1291                                 funnyCode[fragmentPos + imm8OfPShufW2A]=
1292                                         a | (b<<2) | (c<<4) | (d<<6);
1293
1294                                 if(i+4>=dstW) shift=maxShift; //avoid overread
1295                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
1296
1297                                 if(shift && i>=shift)
1298                                 {
1299                                         funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
1300                                         funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
1301                                         filterPos[i/2]-=shift;
1302                                 }
1303
1304                                 fragmentPos+= fragmentLengthA;
1305                         }
1306
1307                         funnyCode[fragmentPos]= RET;
1308                 }
1309                 xpos+=xInc;
1310         }
1311         filterPos[i/2]= xpos>>16; // needed to jump to the next part
1312 }
1313 #endif // ARCH_X86
1314
1315 //FIXME remove
/* Intentionally empty: this function performs no initialization at all. */
void SwScale_Init()
{
}
1318
1319 static void globalInit(){
1320     // generating tables:
1321     int i;
1322     for(i=0; i<768; i++){
1323         int c= MIN(MAX(i-256, 0), 255);
1324         clip_table[i]=c;
1325         yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1326         yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1327         yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1328         yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1329         yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1330     }
1331
1332     for(i=0; i<768; i++)
1333     {
1334         int v= clip_table[i];
1335         clip_table16b[i]=  v>>3;
1336         clip_table16g[i]= (v<<3)&0x07E0;
1337         clip_table16r[i]= (v<<8)&0xF800;
1338         clip_table15b[i]=  v>>3;
1339         clip_table15g[i]= (v<<2)&0x03E0;
1340         clip_table15r[i]= (v<<7)&0x7C00;
1341     }
1342
1343 cpuCaps= gCpuCaps;
1344
1345 #ifdef RUNTIME_CPUDETECT
1346 #ifdef CAN_COMPILE_X86_ASM
1347         // ordered per speed fasterst first
1348         if(gCpuCaps.hasMMX2)
1349                 swScale= swScale_MMX2;
1350         else if(gCpuCaps.has3DNow)
1351                 swScale= swScale_3DNow;
1352         else if(gCpuCaps.hasMMX)
1353                 swScale= swScale_MMX;
1354         else
1355                 swScale= swScale_C;
1356
1357 #else
1358         swScale= swScale_C;
1359         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1360 #endif
1361 #else //RUNTIME_CPUDETECT
1362 #ifdef HAVE_MMX2
1363         swScale= swScale_MMX2;
1364         cpuCaps.has3DNow = 0;
1365 #elif defined (HAVE_3DNOW)
1366         swScale= swScale_3DNow;
1367         cpuCaps.hasMMX2 = 0;
1368 #elif defined (HAVE_MMX)
1369         swScale= swScale_MMX;
1370         cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1371 #else
1372         swScale= swScale_C;
1373         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1374 #endif
1375 #endif //!RUNTIME_CPUDETECT
1376 }
1377
1378 static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1379              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1380         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1381         /* Copy Y plane */
1382         if(dstStride[0]==srcStride[0])
1383                 memcpy(dst, src[0], srcSliceH*dstStride[0]);
1384         else
1385         {
1386                 int i;
1387                 uint8_t *srcPtr= src[0];
1388                 uint8_t *dstPtr= dst;
1389                 for(i=0; i<srcSliceH; i++)
1390                 {
1391                         memcpy(dstPtr, srcPtr, srcStride[0]);
1392                         srcPtr+= srcStride[0];
1393                         dstPtr+= dstStride[0];
1394                 }
1395         }
1396         dst = dstParam[1] + dstStride[1]*srcSliceY;
1397         if(c->srcFormat==IMGFMT_YV12)
1398                 interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );
1399         else /* I420 & IYUV */
1400                 interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
1401 }
1402
1403
1404 /* Warper functions for yuv2bgr */
1405 static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1406              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1407         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1408
1409         if(c->srcFormat==IMGFMT_YV12)
1410                 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1411         else /* I420 & IYUV */
1412                 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1413 }
1414
1415 static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1416              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1417         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1418
1419         if(c->srcFormat==IMGFMT_YV12)
1420                 yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1421         else /* I420 & IYUV */
1422                 yv12toyuy2( src[0],src[2],src[1],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1423 }
1424
1425 static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1426              int srcSliceH, uint8_t* dst[], int dstStride[]){
1427         
1428         if(dstStride[0]*3==srcStride[0]*4)
1429                 rgb24to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1430         else
1431         {
1432                 int i;
1433                 uint8_t *srcPtr= src[0];
1434                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1435
1436                 for(i=0; i<srcSliceH; i++)
1437                 {
1438                         rgb24to32(srcPtr, dstPtr, c->srcW*3);
1439                         srcPtr+= srcStride[0];
1440                         dstPtr+= dstStride[0];
1441                 }
1442         }     
1443 }
1444
1445 static void bgr24to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1446              int srcSliceH, uint8_t* dst[], int dstStride[]){
1447         
1448         if(dstStride[0]*3==srcStride[0]*2)
1449                 rgb24to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1450         else
1451         {
1452                 int i;
1453                 uint8_t *srcPtr= src[0];
1454                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1455
1456                 for(i=0; i<srcSliceH; i++)
1457                 {
1458                         rgb24to16(srcPtr, dstPtr, c->srcW*3);
1459                         srcPtr+= srcStride[0];
1460                         dstPtr+= dstStride[0];
1461                 }
1462         }     
1463 }
1464
1465 static void bgr24to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1466              int srcSliceH, uint8_t* dst[], int dstStride[]){
1467         
1468         if(dstStride[0]*3==srcStride[0]*2)
1469                 rgb24to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1470         else
1471         {
1472                 int i;
1473                 uint8_t *srcPtr= src[0];
1474                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1475
1476                 for(i=0; i<srcSliceH; i++)
1477                 {
1478                         rgb24to15(srcPtr, dstPtr, c->srcW*3);
1479                         srcPtr+= srcStride[0];
1480                         dstPtr+= dstStride[0];
1481                 }
1482         }     
1483 }
1484
1485 static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1486              int srcSliceH, uint8_t* dst[], int dstStride[]){
1487         
1488         if(dstStride[0]*4==srcStride[0]*3)
1489                 rgb32to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1490         else
1491         {
1492                 int i;
1493                 uint8_t *srcPtr= src[0];
1494                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1495
1496                 for(i=0; i<srcSliceH; i++)
1497                 {
1498                         rgb32to24(srcPtr, dstPtr, c->srcW<<2);
1499                         srcPtr+= srcStride[0];
1500                         dstPtr+= dstStride[0];
1501                 }
1502         }     
1503 }
1504
1505 static void bgr32to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1506              int srcSliceH, uint8_t* dst[], int dstStride[]){
1507         
1508         if(dstStride[0]*4==srcStride[0]*2)
1509                 rgb32to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1510         else
1511         {
1512                 int i;
1513                 uint8_t *srcPtr= src[0];
1514                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1515
1516                 for(i=0; i<srcSliceH; i++)
1517                 {
1518                         rgb32to16(srcPtr, dstPtr, c->srcW<<2);
1519                         srcPtr+= srcStride[0];
1520                         dstPtr+= dstStride[0];
1521                 }
1522         }     
1523 }
1524
1525 static void bgr32to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1526              int srcSliceH, uint8_t* dst[], int dstStride[]){
1527         
1528         if(dstStride[0]*4==srcStride[0]*2)
1529                 rgb32to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1530         else
1531         {
1532                 int i;
1533                 uint8_t *srcPtr= src[0];
1534                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1535
1536                 for(i=0; i<srcSliceH; i++)
1537                 {
1538                         rgb32to15(srcPtr, dstPtr, c->srcW<<2);
1539                         srcPtr+= srcStride[0];
1540                         dstPtr+= dstStride[0];
1541                 }
1542         }     
1543 }
1544
1545 static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1546              int srcSliceH, uint8_t* dst[], int dstStride[]){
1547         
1548         if(dstStride[0]==srcStride[0])
1549                 rgb15to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1550         else
1551         {
1552                 int i;
1553                 uint8_t *srcPtr= src[0];
1554                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1555
1556                 for(i=0; i<srcSliceH; i++)
1557                 {
1558                         rgb15to16(srcPtr, dstPtr, c->srcW<<1);
1559                         srcPtr+= srcStride[0];
1560                         dstPtr+= dstStride[0];
1561                 }
1562         }     
1563 }
1564
1565 static void bgr15to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1566              int srcSliceH, uint8_t* dst[], int dstStride[]){
1567         
1568         if(dstStride[0]*2==srcStride[0]*3)
1569                 rgb15to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1570         else
1571         {
1572                 int i;
1573                 uint8_t *srcPtr= src[0];
1574                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1575
1576                 for(i=0; i<srcSliceH; i++)
1577                 {
1578                         rgb15to24(srcPtr, dstPtr, c->srcW<<1);
1579                         srcPtr+= srcStride[0];
1580                         dstPtr+= dstStride[0];
1581                 }
1582         }     
1583 }
1584
1585 static void bgr15to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1586              int srcSliceH, uint8_t* dst[], int dstStride[]){
1587         
1588         if(dstStride[0]*2==srcStride[0]*4)
1589                 rgb15to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1590         else
1591         {
1592                 int i;
1593                 uint8_t *srcPtr= src[0];
1594                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1595
1596                 for(i=0; i<srcSliceH; i++)
1597                 {
1598                         rgb15to32(srcPtr, dstPtr, c->srcW<<1);
1599                         srcPtr+= srcStride[0];
1600                         dstPtr+= dstStride[0];
1601                 }
1602         }     
1603 }
1604
1605 static void bgr16to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1606              int srcSliceH, uint8_t* dst[], int dstStride[]){
1607         
1608         if(dstStride[0]*2==srcStride[0]*3)
1609                 rgb16to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1610         else
1611         {
1612                 int i;
1613                 uint8_t *srcPtr= src[0];
1614                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1615
1616                 for(i=0; i<srcSliceH; i++)
1617                 {
1618                         rgb16to24(srcPtr, dstPtr, c->srcW<<1);
1619                         srcPtr+= srcStride[0];
1620                         dstPtr+= dstStride[0];
1621                 }
1622         }     
1623 }
1624
1625 static void bgr16to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1626              int srcSliceH, uint8_t* dst[], int dstStride[]){
1627         
1628         if(dstStride[0]*2==srcStride[0]*4)
1629                 rgb16to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1630         else
1631         {
1632                 int i;
1633                 uint8_t *srcPtr= src[0];
1634                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1635
1636                 for(i=0; i<srcSliceH; i++)
1637                 {
1638                         rgb16to32(srcPtr, dstPtr, c->srcW<<1);
1639                         srcPtr+= srcStride[0];
1640                         dstPtr+= dstStride[0];
1641                 }
1642         }     
1643 }
1644
1645 static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1646              int srcSliceH, uint8_t* dst[], int dstStride[]){
1647
1648         rgb24toyv12(
1649                 src[0], 
1650                 dst[0]+ srcSliceY    *dstStride[0], 
1651                 dst[1]+(srcSliceY>>1)*dstStride[1], 
1652                 dst[2]+(srcSliceY>>1)*dstStride[2],
1653                 c->srcW, srcSliceH, 
1654                 dstStride[0], dstStride[1], srcStride[0]);
1655 }
1656
1657 /**
1658  * bring pointers in YUV order instead of YVU
1659  */
1660 static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
1661         if(format == IMGFMT_YV12 || format == IMGFMT_YVU9){
1662                 sortedP[0]= p[0];
1663                 sortedP[1]= p[1];
1664                 sortedP[2]= p[2];
1665                 sortedStride[0]= stride[0];
1666                 sortedStride[1]= stride[1];
1667                 sortedStride[2]= stride[2];
1668         }
1669         else if(isPacked(format) || isGray(format))
1670         {
1671                 sortedP[0]= p[0];
1672                 sortedP[1]= 
1673                 sortedP[2]= NULL;
1674                 sortedStride[0]= stride[0];
1675                 sortedStride[1]= 
1676                 sortedStride[2]= 0;
1677         }
1678         else /* I420 */
1679         {
1680                 sortedP[0]= p[0];
1681                 sortedP[1]= p[2];
1682                 sortedP[2]= p[1];
1683                 sortedStride[0]= stride[0];
1684                 sortedStride[1]= stride[2];
1685                 sortedStride[2]= stride[1];
1686         }
1687 }
1688
1689 /* unscaled copy like stuff (assumes nearly identical formats) */
1690 static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
1691              int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
1692
1693         int srcStride[3];
1694         int dstStride[3];
1695         uint8_t *src[3];
1696         uint8_t *dst[3];
1697
1698         orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
1699         orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
1700
1701         if(isPacked(c->srcFormat))
1702         {
1703                 if(dstStride[0]==srcStride[0])
1704                         memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1705                 else
1706                 {
1707                         int i;
1708                         uint8_t *srcPtr= src[0];
1709                         uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1710                         int length=0;
1711
1712                         /* universal length finder */
1713                         while(length+c->srcW <= ABS(dstStride[0]) 
1714                            && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
1715                         ASSERT(length!=0);
1716
1717                         for(i=0; i<srcSliceH; i++)
1718                         {
1719                                 memcpy(dstPtr, srcPtr, length);
1720                                 srcPtr+= srcStride[0];
1721                                 dstPtr+= dstStride[0];
1722                         }
1723                 }
1724         }
1725         else 
1726         { /* Planar YUV or gray */
1727                 int plane;
1728                 for(plane=0; plane<3; plane++)
1729                 {
1730                         int length= plane==0 ? c->srcW  : -((-c->srcW  )>>c->chrDstHSubSample);
1731                         int y=      plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
1732                         int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
1733
1734                         if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
1735                         {
1736                                 if(!isGray(c->dstFormat))
1737                                         memset(dst[plane], 128, dstStride[plane]*height);
1738                         }
1739                         else
1740                         {
1741                                 if(dstStride[plane]==srcStride[plane])
1742                                         memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1743                                 else
1744                                 {
1745                                         int i;
1746                                         uint8_t *srcPtr= src[plane];
1747                                         uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1748                                         for(i=0; i<height; i++)
1749                                         {
1750                                                 memcpy(dstPtr, srcPtr, length);
1751                                                 srcPtr+= srcStride[plane];
1752                                                 dstPtr+= dstStride[plane];
1753                                         }
1754                                 }
1755                         }
1756                 }
1757         }
1758 }
1759
1760 static int remove_dup_fourcc(int fourcc)
1761 {
1762         switch(fourcc)
1763         {
1764             case IMGFMT_IYUV: return IMGFMT_I420;
1765             case IMGFMT_Y8  : return IMGFMT_Y800;
1766             case IMGFMT_IF09: return IMGFMT_YVU9;
1767             default: return fourcc;
1768         }
1769 }
1770
1771 static void getSubSampleFactors(int *h, int *v, int format){
1772         switch(format){
1773         case IMGFMT_YUY2:
1774                 *h=1;
1775                 *v=0;
1776                 break;
1777         case IMGFMT_YV12:
1778         case IMGFMT_I420:
1779         case IMGFMT_Y800: //FIXME remove after different subsamplings are fully implemented
1780                 *h=1;
1781                 *v=1;
1782                 break;
1783         case IMGFMT_YVU9:
1784                 *h=2;
1785                 *v=2;
1786                 break;
1787         default:
1788                 *h=0;
1789                 *v=0;
1790                 break;
1791         }
1792 }
1793
1794 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1795                          SwsFilter *srcFilter, SwsFilter *dstFilter){
1796
1797         SwsContext *c;
1798         int i;
1799         int usesFilter;
1800         int unscaled;
1801         SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1802 #ifdef ARCH_X86
1803         if(gCpuCaps.hasMMX)
1804                 asm volatile("emms\n\t"::: "memory");
1805 #endif
1806         if(swScale==NULL) globalInit();
1807 //srcFormat= IMGFMT_Y800;
1808 //srcFormat= IMGFMT_YVU9;
1809         /* avoid duplicate formats, so we don't need to check too much */
1810         srcFormat = remove_dup_fourcc(srcFormat);
1811         dstFormat = remove_dup_fourcc(dstFormat);
1812
1813         unscaled = (srcW == dstW && srcH == dstH);
1814
1815         if(!isSupportedIn(srcFormat)) 
1816         {
1817                 MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1818                 return NULL;
1819         }
1820         if(!isSupportedOut(dstFormat))
1821         {
1822                 MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1823                 return NULL;
1824         }
1825
1826         /* sanity check */
1827         if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1828         {
1829                  MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
1830                         srcW, srcH, dstW, dstH);
1831                 return NULL;
1832         }
1833
1834         if(!dstFilter) dstFilter= &dummyFilter;
1835         if(!srcFilter) srcFilter= &dummyFilter;
1836
1837         c= memalign(64, sizeof(SwsContext));
1838         memset(c, 0, sizeof(SwsContext));
1839
1840         c->srcW= srcW;
1841         c->srcH= srcH;
1842         c->dstW= dstW;
1843         c->dstH= dstH;
1844         c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1845         c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1846         c->flags= flags;
1847         c->dstFormat= dstFormat;
1848         c->srcFormat= srcFormat;
1849
1850         usesFilter=0;
1851         if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1852         if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1853         if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1854         if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1855         if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1856         if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1857         if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1858         if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1859
1860         getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
1861         getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
1862
1863         // reuse chroma for 2 pixels rgb/bgr unless user wants full chroma interpolation
1864         if((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
1865
1866         // drop every 2nd pixel (vertically) for chroma calculation unless user wants full chroma
1867         if((isBGR(srcFormat) || isRGB(srcFormat) || srcFormat==IMGFMT_YUY2) && !(flags&SWS_FULL_CHR_V)) 
1868                 c->chrSrcVSubSample=1;
1869
1870         // drop every 2nd pixel (horizontally) for chroma calculation unless user wants full chroma
1871         if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)) 
1872                 c->chrSrcHSubSample=1;
1873
1874         c->chrIntHSubSample= c->chrDstHSubSample;
1875         c->chrIntVSubSample= c->chrSrcVSubSample;
1876         
1877         // note: the -((-x)>>y) is so that we always round toward +inf
1878         c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
1879         c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
1880         c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
1881         c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
1882 /*      printf("%d %d %d %d / %d %d %d %d //\n", 
1883         c->chrSrcW,
1884 c->chrSrcH,
1885 c->chrDstW,
1886 c->chrDstH,
1887 srcW,
1888 srcH,
1889 dstW,
1890 dstH);*/
1891         
1892         /* unscaled special Cases */
1893         if(unscaled && !usesFilter)
1894         {
1895                 /* yv12_to_nv12 */
1896                 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_NV12)
1897                 {
1898                         c->swScale= PlanarToNV12Wrapper;
1899
1900                         if(flags&SWS_PRINT_INFO)
1901                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1902                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1903                         return c;
1904                 }
1905                 /* yv12_to_yuy2 */
1906                 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_YUY2)
1907                 {
1908                         c->swScale= PlanarToYuy2Wrapper;
1909
1910                         if(flags&SWS_PRINT_INFO)
1911                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1912                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1913                         return c;
1914                 }
1915                 /* yuv2bgr */
1916                 if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420) && isBGR(dstFormat))
1917                 {
1918                         // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1919 #ifdef WORDS_BIGENDIAN
1920                         if(dstFormat==IMGFMT_BGR32)
1921                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1922                         else
1923                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1924 #else
1925                         yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1926 #endif
1927                         c->swScale= planarYuvToBgr;
1928
1929                         if(flags&SWS_PRINT_INFO)
1930                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1931                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1932                         return c;
1933                 }
1934 #if 1
1935                 /* simple copy */
1936                 if(   srcFormat == dstFormat
1937                    || (srcFormat==IMGFMT_YV12 && dstFormat==IMGFMT_I420)
1938                    || (srcFormat==IMGFMT_I420 && dstFormat==IMGFMT_YV12)
1939                    || (isPlanarYUV(srcFormat) && isGray(dstFormat))
1940                    || (isPlanarYUV(dstFormat) && isGray(srcFormat))
1941                   )
1942                 {
1943                         c->swScale= simpleCopy;
1944
1945                         if(flags&SWS_PRINT_INFO)
1946                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1947                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1948                         return c;
1949                 }
1950 #endif
1951                 /* bgr32to24 & rgb32to24*/
1952                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1953                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1954                 {
1955                         c->swScale= bgr32to24Wrapper;
1956
1957                         if(flags&SWS_PRINT_INFO)
1958                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1959                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1960                         return c;
1961                 }
1962
1963                 /* bgr32to16 & rgb32to16*/
1964                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR16)
1965                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB16))
1966                 {
1967                         c->swScale= bgr32to16Wrapper;
1968
1969                         if(flags&SWS_PRINT_INFO)
1970                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1971                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1972                         return c;
1973                 }
1974
1975                 /* bgr32to15 & rgb32to15*/
1976                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR15)
1977                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB15))
1978                 {
1979                         c->swScale= bgr32to15Wrapper;
1980
1981                         if(flags&SWS_PRINT_INFO)
1982                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1983                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1984                         return c;
1985                 }
1986
1987                 /* bgr24to32 & rgb24to32*/
1988                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1989                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1990                 {
1991                         c->swScale= bgr24to32Wrapper;
1992
1993                         if(flags&SWS_PRINT_INFO)
1994                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
1995                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1996                         return c;
1997                 }
1998
1999                 /* bgr24to16 & rgb24to16*/
2000                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR16)
2001                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB16))
2002                 {
2003                         c->swScale= bgr24to16Wrapper;
2004
2005                         if(flags&SWS_PRINT_INFO)
2006                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2007                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2008                         return c;
2009                 }
2010
2011                 /* bgr24to15 & rgb24to15*/
2012                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR15)
2013                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB15))
2014                 {
2015                         c->swScale= bgr24to15Wrapper;
2016
2017                         if(flags&SWS_PRINT_INFO)
2018                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2019                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2020                         return c;
2021                 }
2022
2023                 /* bgr15to16 */
2024                 if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
2025                 {
2026                         c->swScale= bgr15to16Wrapper;
2027
2028                         if(flags&SWS_PRINT_INFO)
2029                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2030                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2031                         return c;
2032                 }
2033
2034                 /* bgr15to24 */
2035                 if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR24)
2036                  ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB24))
2037                 {
2038                         c->swScale= bgr15to24Wrapper;
2039
2040                         if(flags&SWS_PRINT_INFO)
2041                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2042                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2043                         return c;
2044                 }
2045
2046 #if 0 //segfaults
2047                 /* bgr15to32 */
2048                 if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR32)
2049                  ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB32))
2050                 {
2051                         c->swScale= bgr15to32Wrapper;
2052
2053                         if(flags&SWS_PRINT_INFO)
2054                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2055                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2056                         return c;
2057                 }
2058 #endif
2059                 /* bgr16to24 */
2060                 if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR24)
2061                  ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB24))
2062                 {
2063                         c->swScale= bgr16to24Wrapper;
2064
2065                         if(flags&SWS_PRINT_INFO)
2066                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2067                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2068                         return c;
2069                 }
2070
2071 #if 0 //segfaults
2072                 /* bgr16to32 */
2073                 if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR32)
2074                  ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB32))
2075                 {
2076                         c->swScale= bgr16to32Wrapper;
2077
2078                         if(flags&SWS_PRINT_INFO)
2079                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2080                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2081                         return c;
2082                 }
2083 #endif
2084                 /* bgr24toYV12 */
2085                 if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
2086                 {
2087                         c->swScale= bgr24toyv12Wrapper;
2088
2089                         if(flags&SWS_PRINT_INFO)
2090                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n", 
2091                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2092                         return c;
2093                 }
2094         }
2095
2096         if(cpuCaps.hasMMX2)
2097         {
2098                 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2099                 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
2100                 {
2101                         if(flags&SWS_PRINT_INFO)
2102                                 MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
2103                 }
2104         }
2105         else
2106                 c->canMMX2BeUsed=0;
2107
2108         c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
2109         c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
2110
2111         // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2112         // but only for the FAST_BILINEAR mode otherwise do correct scaling
2113         // n-2 is the last chrominance sample available
2114         // this is not perfect, but noone shuld notice the difference, the more correct variant
2115         // would be like the vertical one, but that would require some special code for the
2116         // first and last pixel
2117         if(flags&SWS_FAST_BILINEAR)
2118         {
2119                 if(c->canMMX2BeUsed)
2120                 {
2121                         c->lumXInc+= 20;
2122                         c->chrXInc+= 20;
2123                 }
2124                 //we dont use the x86asm scaler if mmx is available
2125                 else if(cpuCaps.hasMMX)
2126                 {
2127                         c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2128                         c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
2129                 }
2130         }
2131
2132         /* precalculate horizontal scaler filter coefficients */
2133         {
2134                 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
2135
2136                 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
2137                                  srcW      ,       dstW, filterAlign, 1<<14, flags,
2138                                  srcFilter->lumH, dstFilter->lumH);
2139                 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
2140                                  c->chrSrcW, c->chrDstW, filterAlign, 1<<14, flags,
2141                                  srcFilter->chrH, dstFilter->chrH);
2142
2143 #ifdef ARCH_X86
2144 // cant downscale !!!
2145                 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
2146                 {
2147                         c->lumMmx2Filter   = (int16_t*)memalign(8, (dstW        /8+8)*sizeof(int16_t));
2148                         c->chrMmx2Filter   = (int16_t*)memalign(8, (c->chrDstW  /4+8)*sizeof(int16_t));
2149                         c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW      /2/8+8)*sizeof(int32_t));
2150                         c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
2151
2152                         initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
2153                         initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
2154                 }
2155 #endif
2156         } // Init Horizontal stuff
2157
2158
2159
2160         /* precalculate vertical scaler filter coefficients */
2161         initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
2162                         srcH      ,        dstH, 1, (1<<12)-4, flags,
2163                         srcFilter->lumV, dstFilter->lumV);
2164         initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
2165                         c->chrSrcH, c->chrDstH, 1, (1<<12)-4, flags,
2166                          srcFilter->chrV, dstFilter->chrV);
2167
2168         // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2169         c->vLumBufSize= c->vLumFilterSize;
2170         c->vChrBufSize= c->vChrFilterSize;
2171         for(i=0; i<dstH; i++)
2172         {
2173                 int chrI= i*c->chrDstH / dstH;
2174                 int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
2175                                  ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
2176                 nextSlice&= ~3; // Slices start at boundaries which are divisable through 4
2177                 if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
2178                         c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
2179                 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
2180                         c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI];
2181         }
2182
2183         // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2184         c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
2185         c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
2186         //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
2187         for(i=0; i<c->vLumBufSize; i++)
2188                 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
2189         for(i=0; i<c->vChrBufSize; i++)
2190                 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
2191
2192         //try to avoid drawing green stuff between the right end and the stride end
2193         for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
2194         for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
2195
2196         ASSERT(c->chrDstH <= dstH)
2197
2198         // pack filter data for mmx code
2199         if(cpuCaps.hasMMX)
2200         {
2201                 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
2202                 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
2203                 for(i=0; i<c->vLumFilterSize*dstH; i++)
2204                         c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
2205                                 c->vLumFilter[i];
2206                 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
2207                         c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
2208                                 c->vChrFilter[i];
2209         }
2210
2211         if(flags&SWS_PRINT_INFO)
2212         {
2213 #ifdef DITHER1XBPP
2214                 char *dither= " dithered";
2215 #else
2216                 char *dither= "";
2217 #endif
2218                 if(flags&SWS_FAST_BILINEAR)
2219                         MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
2220                 else if(flags&SWS_BILINEAR)
2221                         MSG_INFO("\nSwScaler: BILINEAR scaler, ");
2222                 else if(flags&SWS_BICUBIC)
2223                         MSG_INFO("\nSwScaler: BICUBIC scaler, ");
2224                 else if(flags&SWS_X)
2225                         MSG_INFO("\nSwScaler: Experimental scaler, ");
2226                 else if(flags&SWS_POINT)
2227                         MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
2228                 else if(flags&SWS_AREA)
2229                         MSG_INFO("\nSwScaler: Area Averageing scaler, ");
2230                 else
2231                         MSG_INFO("\nSwScaler: ehh flags invalid?! ");
2232
2233                 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
2234                         MSG_INFO("from %s to%s %s ", 
2235                                 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
2236                 else
2237                         MSG_INFO("from %s to %s ", 
2238                                 vo_format_name(srcFormat), vo_format_name(dstFormat));
2239
2240                 if(cpuCaps.hasMMX2)
2241                         MSG_INFO("using MMX2\n");
2242                 else if(cpuCaps.has3DNow)
2243                         MSG_INFO("using 3DNOW\n");
2244                 else if(cpuCaps.hasMMX)
2245                         MSG_INFO("using MMX\n");
2246                 else
2247                         MSG_INFO("using C\n");
2248         }
2249
2250         if((flags & SWS_PRINT_INFO) && verbose)
2251         {
2252                 if(cpuCaps.hasMMX)
2253                 {
2254                         if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
2255                                 MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2256                         else
2257                         {
2258                                 if(c->hLumFilterSize==4)
2259                                         MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2260                                 else if(c->hLumFilterSize==8)
2261                                         MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2262                                 else
2263                                         MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2264
2265                                 if(c->hChrFilterSize==4)
2266                                         MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2267                                 else if(c->hChrFilterSize==8)
2268                                         MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2269                                 else
2270                                         MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2271                         }
2272                 }
2273                 else
2274                 {
2275 #ifdef ARCH_X86
2276                         MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2277 #else
2278                         if(flags & SWS_FAST_BILINEAR)
2279                                 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2280                         else
2281                                 MSG_V("SwScaler: using C scaler for horizontal scaling\n");
2282 #endif
2283                 }
2284                 if(isPlanarYUV(dstFormat))
2285                 {
2286                         if(c->vLumFilterSize==1)
2287                                 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
2288                         else
2289                                 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
2290                 }
2291                 else
2292                 {
2293                         if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
2294                                 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2295                                        "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
2296                         else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
2297                                 MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
2298                         else
2299                                 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
2300                 }
2301
2302                 if(dstFormat==IMGFMT_BGR24)
2303                         MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
2304                                 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
2305                 else if(dstFormat==IMGFMT_BGR32)
2306                         MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2307                 else if(dstFormat==IMGFMT_BGR16)
2308                         MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2309                 else if(dstFormat==IMGFMT_BGR15)
2310                         MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2311
2312                 MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2313         }
2314         if((flags & SWS_PRINT_INFO) && verbose>1)
2315         {
2316                 MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2317                         c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
2318                 MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2319                         c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
2320         }
2321
2322         c->swScale= swScale;
2323         return c;
2324 }
2325
2326 /**
2327  * returns a normalized gaussian curve used to filter stuff
2328  * quality=3 is high quality, lowwer is lowwer quality
2329  */
2330
2331 SwsVector *getGaussianVec(double variance, double quality){
2332         const int length= (int)(variance*quality + 0.5) | 1;
2333         int i;
2334         double *coeff= memalign(sizeof(double), length*sizeof(double));
2335         double middle= (length-1)*0.5;
2336         SwsVector *vec= malloc(sizeof(SwsVector));
2337
2338         vec->coeff= coeff;
2339         vec->length= length;
2340
2341         for(i=0; i<length; i++)
2342         {
2343                 double dist= i-middle;
2344                 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
2345         }
2346
2347         normalizeVec(vec, 1.0);
2348
2349         return vec;
2350 }
2351
2352 SwsVector *getConstVec(double c, int length){
2353         int i;
2354         double *coeff= memalign(sizeof(double), length*sizeof(double));
2355         SwsVector *vec= malloc(sizeof(SwsVector));
2356
2357         vec->coeff= coeff;
2358         vec->length= length;
2359
2360         for(i=0; i<length; i++)
2361                 coeff[i]= c;
2362
2363         return vec;
2364 }
2365
2366
2367 SwsVector *getIdentityVec(void){
2368         double *coeff= memalign(sizeof(double), sizeof(double));
2369         SwsVector *vec= malloc(sizeof(SwsVector));
2370         coeff[0]= 1.0;
2371
2372         vec->coeff= coeff;
2373         vec->length= 1;
2374
2375         return vec;
2376 }
2377
2378 void normalizeVec(SwsVector *a, double height){
2379         int i;
2380         double sum=0;
2381         double inv;
2382
2383         for(i=0; i<a->length; i++)
2384                 sum+= a->coeff[i];
2385
2386         inv= height/sum;
2387
2388         for(i=0; i<a->length; i++)
2389                 a->coeff[i]*= height;
2390 }
2391
2392 void scaleVec(SwsVector *a, double scalar){
2393         int i;
2394
2395         for(i=0; i<a->length; i++)
2396                 a->coeff[i]*= scalar;
2397 }
2398
2399 static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
2400         int length= a->length + b->length - 1;
2401         double *coeff= memalign(sizeof(double), length*sizeof(double));
2402         int i, j;
2403         SwsVector *vec= malloc(sizeof(SwsVector));
2404
2405         vec->coeff= coeff;
2406         vec->length= length;
2407
2408         for(i=0; i<length; i++) coeff[i]= 0.0;
2409
2410         for(i=0; i<a->length; i++)
2411         {
2412                 for(j=0; j<b->length; j++)
2413                 {
2414                         coeff[i+j]+= a->coeff[i]*b->coeff[j];
2415                 }
2416         }
2417
2418         return vec;
2419 }
2420
2421 static SwsVector *sumVec(SwsVector *a, SwsVector *b){
2422         int length= MAX(a->length, b->length);
2423         double *coeff= memalign(sizeof(double), length*sizeof(double));
2424         int i;
2425         SwsVector *vec= malloc(sizeof(SwsVector));
2426
2427         vec->coeff= coeff;
2428         vec->length= length;
2429
2430         for(i=0; i<length; i++) coeff[i]= 0.0;
2431
2432         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2433         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
2434
2435         return vec;
2436 }
2437
2438 static SwsVector *diffVec(SwsVector *a, SwsVector *b){
2439         int length= MAX(a->length, b->length);
2440         double *coeff= memalign(sizeof(double), length*sizeof(double));
2441         int i;
2442         SwsVector *vec= malloc(sizeof(SwsVector));
2443
2444         vec->coeff= coeff;
2445         vec->length= length;
2446
2447         for(i=0; i<length; i++) coeff[i]= 0.0;
2448
2449         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2450         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
2451
2452         return vec;
2453 }
2454
2455 /* shift left / or right if "shift" is negative */
2456 static SwsVector *getShiftedVec(SwsVector *a, int shift){
2457         int length= a->length + ABS(shift)*2;
2458         double *coeff= memalign(sizeof(double), length*sizeof(double));
2459         int i;
2460         SwsVector *vec= malloc(sizeof(SwsVector));
2461
2462         vec->coeff= coeff;
2463         vec->length= length;
2464
2465         for(i=0; i<length; i++) coeff[i]= 0.0;
2466
2467         for(i=0; i<a->length; i++)
2468         {
2469                 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
2470         }
2471
2472         return vec;
2473 }
2474
2475 void shiftVec(SwsVector *a, int shift){
2476         SwsVector *shifted= getShiftedVec(a, shift);
2477         free(a->coeff);
2478         a->coeff= shifted->coeff;
2479         a->length= shifted->length;
2480         free(shifted);
2481 }
2482
2483 void addVec(SwsVector *a, SwsVector *b){
2484         SwsVector *sum= sumVec(a, b);
2485         free(a->coeff);
2486         a->coeff= sum->coeff;
2487         a->length= sum->length;
2488         free(sum);
2489 }
2490
2491 void subVec(SwsVector *a, SwsVector *b){
2492         SwsVector *diff= diffVec(a, b);
2493         free(a->coeff);
2494         a->coeff= diff->coeff;
2495         a->length= diff->length;
2496         free(diff);
2497 }
2498
2499 void convVec(SwsVector *a, SwsVector *b){
2500         SwsVector *conv= getConvVec(a, b);
2501         free(a->coeff);
2502         a->coeff= conv->coeff;
2503         a->length= conv->length;
2504         free(conv);
2505 }
2506
2507 SwsVector *cloneVec(SwsVector *a){
2508         double *coeff= memalign(sizeof(double), a->length*sizeof(double));
2509         int i;
2510         SwsVector *vec= malloc(sizeof(SwsVector));
2511
2512         vec->coeff= coeff;
2513         vec->length= a->length;
2514
2515         for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
2516
2517         return vec;
2518 }
2519
2520 void printVec(SwsVector *a){
2521         int i;
2522         double max=0;
2523         double min=0;
2524         double range;
2525
2526         for(i=0; i<a->length; i++)
2527                 if(a->coeff[i]>max) max= a->coeff[i];
2528
2529         for(i=0; i<a->length; i++)
2530                 if(a->coeff[i]<min) min= a->coeff[i];
2531
2532         range= max - min;
2533
2534         for(i=0; i<a->length; i++)
2535         {
2536                 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
2537                 MSG_DBG2("%1.3f ", a->coeff[i]);
2538                 for(;x>0; x--) MSG_DBG2(" ");
2539                 MSG_DBG2("|\n");
2540         }
2541 }
2542
2543 void freeVec(SwsVector *a){
2544         if(!a) return;
2545         if(a->coeff) free(a->coeff);
2546         a->coeff=NULL;
2547         a->length=0;
2548         free(a);
2549 }
2550
2551 void freeSwsContext(SwsContext *c){
2552         int i;
2553         if(!c) return;
2554
2555         if(c->lumPixBuf)
2556         {
2557                 for(i=0; i<c->vLumBufSize; i++)
2558                 {
2559                         if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
2560                         c->lumPixBuf[i]=NULL;
2561                 }
2562                 free(c->lumPixBuf);
2563                 c->lumPixBuf=NULL;
2564         }
2565
2566         if(c->chrPixBuf)
2567         {
2568                 for(i=0; i<c->vChrBufSize; i++)
2569                 {
2570                         if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
2571                         c->chrPixBuf[i]=NULL;
2572                 }
2573                 free(c->chrPixBuf);
2574                 c->chrPixBuf=NULL;
2575         }
2576
2577         if(c->vLumFilter) free(c->vLumFilter);
2578         c->vLumFilter = NULL;
2579         if(c->vChrFilter) free(c->vChrFilter);
2580         c->vChrFilter = NULL;
2581         if(c->hLumFilter) free(c->hLumFilter);
2582         c->hLumFilter = NULL;
2583         if(c->hChrFilter) free(c->hChrFilter);
2584         c->hChrFilter = NULL;
2585
2586         if(c->vLumFilterPos) free(c->vLumFilterPos);
2587         c->vLumFilterPos = NULL;
2588         if(c->vChrFilterPos) free(c->vChrFilterPos);
2589         c->vChrFilterPos = NULL;
2590         if(c->hLumFilterPos) free(c->hLumFilterPos);
2591         c->hLumFilterPos = NULL;
2592         if(c->hChrFilterPos) free(c->hChrFilterPos);
2593         c->hChrFilterPos = NULL;
2594
2595         if(c->lumMmxFilter) free(c->lumMmxFilter);
2596         c->lumMmxFilter = NULL;
2597         if(c->chrMmxFilter) free(c->chrMmxFilter);
2598         c->chrMmxFilter = NULL;
2599
2600         if(c->lumMmx2Filter) free(c->lumMmx2Filter);
2601         c->lumMmx2Filter=NULL;
2602         if(c->chrMmx2Filter) free(c->chrMmx2Filter);
2603         c->chrMmx2Filter=NULL;
2604         if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
2605         c->lumMmx2FilterPos=NULL;
2606         if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
2607         c->chrMmx2FilterPos=NULL;
2608
2609         free(c);
2610 }
2611
2612