]> git.sesse.net Git - ffmpeg/blob - postproc/swscale.c
-sws 2 is default now
[ffmpeg] / postproc / swscale.c
1 /*
2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20   supported Input formats: YV12, I420, IYUV (grayscale soon too)
21   supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
22   BGR15/16 support dithering
23 */
24
25 #include <inttypes.h>
26 #include <string.h>
27 #include <math.h>
28 #include <stdio.h>
29 #include "../config.h"
30 #include "../mangle.h"
31 #ifdef HAVE_MALLOC_H
32 #include <malloc.h>
33 #endif
34 #include "swscale.h"
35 #include "../cpudetect.h"
36 #include "../libvo/img_format.h"
37 #undef MOVNTQ
38 #undef PAVGB
39
40 //#undef HAVE_MMX2
41 //#define HAVE_3DNOW
42 //#undef HAVE_MMX
43 //#undef ARCH_X86
44 #define DITHER1XBPP
45
46 #define RET 0xC3 //near return opcode
47
48 #ifdef MP_DEBUG
49 #define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
50 #else
51 #define ASSERT(x) ;
52 #endif
53
54 #ifdef M_PI
55 #define PI M_PI
56 #else
57 #define PI 3.14159265358979323846
58 #endif
59
60 //FIXME replace this with something faster
61 #define isYUV(x)       ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
62 #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
63 #define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
64 #define isHalfChrH(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
65
66 extern int verbose; // defined in mplayer.c
67 /*
68 NOTES
69
70 known BUGS with known cause (no bugreports please!, but patches are welcome :) )
71 horizontal fast_bilinear MMX2 scaler reads 1-7 samples too much (might cause a sig11)
72
73 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
74
75 TODO
76 more intelligent misalignment avoidance for the horizontal scaler
77 change the distance of the u & v buffer
78 write special vertical cubic upscale version
79 Optimize C code (yv12 / minmax)
80 add support for packed pixel yuv input & output
81 add support for Y8 input & output
82 add BGR4 output support
83 add BGR32 / BGR24 input support
84 */
85
86 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
87 #define MIN(a,b) ((a) > (b) ? (b) : (a))
88 #define MAX(a,b) ((a) < (b) ? (b) : (a))
89
90 #ifdef ARCH_X86
91 #define CAN_COMPILE_X86_ASM
92 #endif
93
94 #ifdef CAN_COMPILE_X86_ASM
95 static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
96 static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
97 static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
98 static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
99 static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
100 static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
101 static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
102 static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
103 static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
104 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
105 static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
106 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
107 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
108 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
109
110 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
111 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
112 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
113 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
114
115 static uint64_t __attribute__((aligned(8))) dither4[2]={
116         0x0103010301030103LL,
117         0x0200020002000200LL,};
118
119 static uint64_t __attribute__((aligned(8))) dither8[2]={
120         0x0602060206020602LL,
121         0x0004000400040004LL,};
122
123 static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
124 static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
125 static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
126 static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
127 static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
128 static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
129
130 static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
131 static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
132 static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
133
134 // FIXME remove
135 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
136 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
137 #endif
138
139 // clipping helper table for C implementations:
140 static unsigned char clip_table[768];
141
142 static unsigned short clip_table16b[768];
143 static unsigned short clip_table16g[768];
144 static unsigned short clip_table16r[768];
145 static unsigned short clip_table15b[768];
146 static unsigned short clip_table15g[768];
147 static unsigned short clip_table15r[768];
148
149 // yuv->rgb conversion tables:
150 static    int yuvtab_2568[256];
151 static    int yuvtab_3343[256];
152 static    int yuvtab_0c92[256];
153 static    int yuvtab_1a1e[256];
154 static    int yuvtab_40cf[256];
155 // Needed for cubic scaler to catch overflows
156 static    int clip_yuvtab_2568[768];
157 static    int clip_yuvtab_3343[768];
158 static    int clip_yuvtab_0c92[768];
159 static    int clip_yuvtab_1a1e[768];
160 static    int clip_yuvtab_40cf[768];
161
162 //global sws_flags from the command line
163 int sws_flags=2;
164
165 //global srcFilter
166 SwsFilter src_filter= {NULL, NULL, NULL, NULL};
167
168 float sws_lum_gblur= 0.0;
169 float sws_chr_gblur= 0.0;
170 int sws_chr_vshift= 0;
171 int sws_chr_hshift= 0;
172 float sws_chr_sharpen= 0.0;
173 float sws_lum_sharpen= 0.0;
174
175 /* cpuCaps combined from cpudetect and what is actually compiled in
176    (if support for something is not compiled in, it won't appear here) */
177 static CpuCaps cpuCaps;
178
179 void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
180              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
181
182 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
183
184 #ifdef CAN_COMPILE_X86_ASM
185 void in_asm_used_var_warning_killer()
186 {
187  volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
188  bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
189  M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0];
190  if(i) i=0;
191 }
192 #endif
193
/*
 * Plain C vertical scaler for planar YUV output.
 *
 * Every output sample is the sum of filterSize source lines weighted by the
 * matching filter coefficient, scaled back with >>19 and clamped to 0..255.
 *
 * lumFilter / lumSrc / lumFilterSize: luma coefficients, the luma source
 *      lines they apply to, and their count
 * chrFilter / chrSrc / chrFilterSize: same for chroma; each chroma line holds
 *      the U samples at [i] and the V samples at [i + 2048]
 * dest:          luma output line, dstW samples are written
 * uDest / vDest: chroma output lines, dstW>>1 samples each are written;
 *      if uDest is NULL no chroma is written at all
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
				    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
				    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
	int x;
	int tap;

	/* luma plane */
	for(x=0; x<dstW; x++)
	{
		int acc= 0;

		for(tap=0; tap<lumFilterSize; tap++)
			acc += lumSrc[tap][x] * lumFilter[tap];

		acc >>= 19;
		dest[x]= MIN(MAX(acc, 0), 255);
	}

	/* chroma is optional */
	if(uDest == NULL)
		return;

	{
		const int chrW= dstW>>1;

		for(x=0; x<chrW; x++)
		{
			int accU= 0;
			int accV= 0;

			for(tap=0; tap<chrFilterSize; tap++)
			{
				const int coeff= chrFilter[tap];

				accU += chrSrc[tap][x]        * coeff;
				accV += chrSrc[tap][x + 2048] * coeff;
			}

			accU >>= 19;
			accV >>= 19;
			uDest[x]= MIN(MAX(accU, 0), 255);
			vDest[x]= MIN(MAX(accV, 0), 255);
		}
	}
}
226
227 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
228                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
229                                     uint8_t *dest, int dstW, int dstFormat)
230 {
231         if(dstFormat==IMGFMT_BGR32)
232         {
233                 int i;
234                 for(i=0; i<(dstW>>1); i++){
235                         int j;
236                         int Y1=0;
237                         int Y2=0;
238                         int U=0;
239                         int V=0;
240                         int Cb, Cr, Cg;
241                         for(j=0; j<lumFilterSize; j++)
242                         {
243                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
244                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
245                         }
246                         for(j=0; j<chrFilterSize; j++)
247                         {
248                                 U += chrSrc[j][i] * chrFilter[j];
249                                 V += chrSrc[j][i+2048] * chrFilter[j];
250                         }
251                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
252                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
253                         U >>= 19;
254                         V >>= 19;
255
256                         Cb= clip_yuvtab_40cf[U+ 256];
257                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
258                         Cr= clip_yuvtab_3343[V+ 256];
259
260                         dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
261                         dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
262                         dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
263
264                         dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
265                         dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
266                         dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
267                 }
268         }
269         else if(dstFormat==IMGFMT_BGR24)
270         {
271                 int i;
272                 for(i=0; i<(dstW>>1); i++){
273                         int j;
274                         int Y1=0;
275                         int Y2=0;
276                         int U=0;
277                         int V=0;
278                         int Cb, Cr, Cg;
279                         for(j=0; j<lumFilterSize; j++)
280                         {
281                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
282                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
283                         }
284                         for(j=0; j<chrFilterSize; j++)
285                         {
286                                 U += chrSrc[j][i] * chrFilter[j];
287                                 V += chrSrc[j][i+2048] * chrFilter[j];
288                         }
289                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
290                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
291                         U >>= 19;
292                         V >>= 19;
293
294                         Cb= clip_yuvtab_40cf[U+ 256];
295                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
296                         Cr= clip_yuvtab_3343[V+ 256];
297
298                         dest[0]=clip_table[((Y1 + Cb) >>13)];
299                         dest[1]=clip_table[((Y1 + Cg) >>13)];
300                         dest[2]=clip_table[((Y1 + Cr) >>13)];
301
302                         dest[3]=clip_table[((Y2 + Cb) >>13)];
303                         dest[4]=clip_table[((Y2 + Cg) >>13)];
304                         dest[5]=clip_table[((Y2 + Cr) >>13)];
305                         dest+=6;
306                 }
307         }
308         else if(dstFormat==IMGFMT_BGR16)
309         {
310                 int i;
311 #ifdef DITHER1XBPP
312                 static int ditherb1=1<<14;
313                 static int ditherg1=1<<13;
314                 static int ditherr1=2<<14;
315                 static int ditherb2=3<<14;
316                 static int ditherg2=3<<13;
317                 static int ditherr2=0<<14;
318
319                 ditherb1 ^= (1^2)<<14;
320                 ditherg1 ^= (1^2)<<13;
321                 ditherr1 ^= (1^2)<<14;
322                 ditherb2 ^= (3^0)<<14;
323                 ditherg2 ^= (3^0)<<13;
324                 ditherr2 ^= (3^0)<<14;
325 #else
326                 const int ditherb1=0;
327                 const int ditherg1=0;
328                 const int ditherr1=0;
329                 const int ditherb2=0;
330                 const int ditherg2=0;
331                 const int ditherr2=0;
332 #endif
333                 for(i=0; i<(dstW>>1); i++){
334                         int j;
335                         int Y1=0;
336                         int Y2=0;
337                         int U=0;
338                         int V=0;
339                         int Cb, Cr, Cg;
340                         for(j=0; j<lumFilterSize; j++)
341                         {
342                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
343                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
344                         }
345                         for(j=0; j<chrFilterSize; j++)
346                         {
347                                 U += chrSrc[j][i] * chrFilter[j];
348                                 V += chrSrc[j][i+2048] * chrFilter[j];
349                         }
350                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
351                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
352                         U >>= 19;
353                         V >>= 19;
354
355                         Cb= clip_yuvtab_40cf[U+ 256];
356                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
357                         Cr= clip_yuvtab_3343[V+ 256];
358
359                         ((uint16_t*)dest)[2*i] =
360                                 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
361                                 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
362                                 clip_table16r[(Y1 + Cr + ditherr1) >>13];
363
364                         ((uint16_t*)dest)[2*i+1] =
365                                 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
366                                 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
367                                 clip_table16r[(Y2 + Cr + ditherr2) >>13];
368                 }
369         }
370         else if(dstFormat==IMGFMT_BGR15)
371         {
372                 int i;
373 #ifdef DITHER1XBPP
374                 static int ditherb1=1<<14;
375                 static int ditherg1=1<<14;
376                 static int ditherr1=2<<14;
377                 static int ditherb2=3<<14;
378                 static int ditherg2=3<<14;
379                 static int ditherr2=0<<14;
380
381                 ditherb1 ^= (1^2)<<14;
382                 ditherg1 ^= (1^2)<<14;
383                 ditherr1 ^= (1^2)<<14;
384                 ditherb2 ^= (3^0)<<14;
385                 ditherg2 ^= (3^0)<<14;
386                 ditherr2 ^= (3^0)<<14;
387 #else
388                 const int ditherb1=0;
389                 const int ditherg1=0;
390                 const int ditherr1=0;
391                 const int ditherb2=0;
392                 const int ditherg2=0;
393                 const int ditherr2=0;
394 #endif
395                 for(i=0; i<(dstW>>1); i++){
396                         int j;
397                         int Y1=0;
398                         int Y2=0;
399                         int U=0;
400                         int V=0;
401                         int Cb, Cr, Cg;
402                         for(j=0; j<lumFilterSize; j++)
403                         {
404                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
405                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
406                         }
407                         for(j=0; j<chrFilterSize; j++)
408                         {
409                                 U += chrSrc[j][i] * chrFilter[j];
410                                 V += chrSrc[j][i+2048] * chrFilter[j];
411                         }
412                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
413                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
414                         U >>= 19;
415                         V >>= 19;
416
417                         Cb= clip_yuvtab_40cf[U+ 256];
418                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
419                         Cr= clip_yuvtab_3343[V+ 256];
420
421                         ((uint16_t*)dest)[2*i] =
422                                 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
423                                 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
424                                 clip_table15r[(Y1 + Cr + ditherr1) >>13];
425
426                         ((uint16_t*)dest)[2*i+1] =
427                                 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
428                                 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
429                                 clip_table15r[(Y2 + Cr + ditherr2) >>13];
430                 }
431         }
432 }
433
434
435 //Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
436 //Plain C versions
437 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
438 #define COMPILE_C
439 #endif
440
441 #ifdef CAN_COMPILE_X86_ASM
442
443 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
444 #define COMPILE_MMX
445 #endif
446
447 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
448 #define COMPILE_MMX2
449 #endif
450
451 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
452 #define COMPILE_3DNOW
453 #endif
454 #endif //CAN_COMPILE_X86_ASM
455
456 #undef HAVE_MMX
457 #undef HAVE_MMX2
458 #undef HAVE_3DNOW
459
460 #ifdef COMPILE_C
461 #undef HAVE_MMX
462 #undef HAVE_MMX2
463 #undef HAVE_3DNOW
464 #define RENAME(a) a ## _C
465 #include "swscale_template.c"
466 #endif
467
468 #ifdef CAN_COMPILE_X86_ASM
469
470 //X86 versions
471 /*
472 #undef RENAME
473 #undef HAVE_MMX
474 #undef HAVE_MMX2
475 #undef HAVE_3DNOW
476 #define ARCH_X86
477 #define RENAME(a) a ## _X86
478 #include "swscale_template.c"
479 */
480 //MMX versions
481 #ifdef COMPILE_MMX
482 #undef RENAME
483 #define HAVE_MMX
484 #undef HAVE_MMX2
485 #undef HAVE_3DNOW
486 #define RENAME(a) a ## _MMX
487 #include "swscale_template.c"
488 #endif
489
490 //MMX2 versions
491 #ifdef COMPILE_MMX2
492 #undef RENAME
493 #define HAVE_MMX
494 #define HAVE_MMX2
495 #undef HAVE_3DNOW
496 #define RENAME(a) a ## _MMX2
497 #include "swscale_template.c"
498 #endif
499
500 //3DNOW versions
501 #ifdef COMPILE_3DNOW
502 #undef RENAME
503 #define HAVE_MMX
504 #undef HAVE_MMX2
505 #define HAVE_3DNOW
506 #define RENAME(a) a ## _3DNow
507 #include "swscale_template.c"
508 #endif
509
510 #endif //CAN_COMPILE_X86_ASM
511
512 // minor note: the HAVE_xyz macros are messed up after this point, so don't use them
513
514
515 // old global scaler, dont use for new code
516 // will use sws_flags from the command line
517 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
518                              int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
519                              int srcW, int srcH, int dstW, int dstH){
520
521         static SwsContext *context=NULL;
522         int dstFormat;
523         int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
524
525         switch(dstbpp)
526         {
527                 case 8 : dstFormat= IMGFMT_Y8;          break;
528                 case 12: dstFormat= IMGFMT_YV12;        break;
529                 case 15: dstFormat= IMGFMT_BGR15;       break;
530                 case 16: dstFormat= IMGFMT_BGR16;       break;
531                 case 24: dstFormat= IMGFMT_BGR24;       break;
532                 case 32: dstFormat= IMGFMT_BGR32;       break;
533                 default: return;
534         }
535
536         if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
537
538         swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
539 }
540
541 // will use sws_flags & src_filter (from cmd line)
542 SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
543 {
544         int flags=0;
545         static int firstTime=1;
546
547 #ifdef ARCH_X86
548         if(gCpuCaps.hasMMX)
549                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
550 #endif
551         if(firstTime)
552         {
553                 firstTime=0;
554                 flags= SWS_PRINT_INFO;
555         }
556         else if(verbose>1) flags= SWS_PRINT_INFO;
557
558         if(src_filter.lumH) freeVec(src_filter.lumH);
559         if(src_filter.lumV) freeVec(src_filter.lumV);
560         if(src_filter.chrH) freeVec(src_filter.chrH);
561         if(src_filter.chrV) freeVec(src_filter.chrV);
562
563         if(sws_lum_gblur!=0.0){
564                 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
565                 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
566         }else{
567                 src_filter.lumH= getIdentityVec();
568                 src_filter.lumV= getIdentityVec();
569         }
570
571         if(sws_chr_gblur!=0.0){
572                 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
573                 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
574         }else{
575                 src_filter.chrH= getIdentityVec();
576                 src_filter.chrV= getIdentityVec();
577         }
578
579         if(sws_chr_sharpen!=0.0){
580                 SwsVector *g= getConstVec(-1.0, 3);
581                 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
582                 g->coeff[1]=2.0;
583                 addVec(id, g);
584                 convVec(src_filter.chrH, id);
585                 convVec(src_filter.chrV, id);
586                 freeVec(g);
587                 freeVec(id);
588         }
589
590         if(sws_lum_sharpen!=0.0){
591                 SwsVector *g= getConstVec(-1.0, 3);
592                 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
593                 g->coeff[1]=2.0;
594                 addVec(id, g);
595                 convVec(src_filter.lumH, id);
596                 convVec(src_filter.lumV, id);
597                 freeVec(g);
598                 freeVec(id);
599         }
600
601         if(sws_chr_hshift)
602                 shiftVec(src_filter.chrH, sws_chr_hshift);
603
604         if(sws_chr_vshift)
605                 shiftVec(src_filter.chrV, sws_chr_vshift);
606
607         normalizeVec(src_filter.chrH, 1.0);
608         normalizeVec(src_filter.chrV, 1.0);
609         normalizeVec(src_filter.lumH, 1.0);
610         normalizeVec(src_filter.lumV, 1.0);
611
612         if(verbose > 1) printVec(src_filter.chrH);
613         if(verbose > 1) printVec(src_filter.lumH);
614
615         switch(sws_flags)
616         {
617                 case 0: flags|= SWS_FAST_BILINEAR; break;
618                 case 1: flags|= SWS_BILINEAR; break;
619                 case 2: flags|= SWS_BICUBIC; break;
620                 case 3: flags|= SWS_X; break;
621                 case 4: flags|= SWS_POINT; break;
622                 case 5: flags|= SWS_AREA; break;
623                 default:flags|= SWS_BILINEAR; break;
624         }
625
626         return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
627 }
628
629
630 static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
631                               int srcW, int dstW, int filterAlign, int one, int flags,
632                               SwsVector *srcFilter, SwsVector *dstFilter)
633 {
634         int i;
635         int filterSize;
636         int filter2Size;
637         int minFilterSize;
638         double *filter=NULL;
639         double *filter2=NULL;
640 #ifdef ARCH_X86
641         if(gCpuCaps.hasMMX)
642                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
643 #endif
644
645         *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
646         (*filterPos)[dstW]=0; // the MMX scaler will read over the end 
647
648         if(ABS(xInc - 0x10000) <10) // unscaled
649         {
650                 int i;
651                 filterSize= 1;
652                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
653                 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
654
655                 for(i=0; i<dstW; i++)
656                 {
657                         filter[i*filterSize]=1;
658                         (*filterPos)[i]=i;
659                 }
660
661         }
662         else if(flags&SWS_POINT) // lame looking point sampling mode
663         {
664                 int i;
665                 int xDstInSrc;
666                 filterSize= 1;
667                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
668                 
669                 xDstInSrc= xInc/2 - 0x8000;
670                 for(i=0; i<dstW; i++)
671                 {
672                         int xx= (xDstInSrc>>16) - (filterSize>>1) + 1;
673
674                         (*filterPos)[i]= xx;
675                         filter[i]= 1.0;
676                         xDstInSrc+= xInc;
677                 }
678         }
679         else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
680         {
681                 int i;
682                 int xDstInSrc;
683                 if     (flags&SWS_BICUBIC) filterSize= 4;
684                 else if(flags&SWS_X      ) filterSize= 4;
685                 else                       filterSize= 2; // SWS_BILINEAR / SWS_AREA 
686 //              printf("%d %d %d\n", filterSize, srcW, dstW);
687                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
688
689                 xDstInSrc= xInc/2 - 0x8000;
690                 for(i=0; i<dstW; i++)
691                 {
692                         int xx= (xDstInSrc>>16) - (filterSize>>1) + 1;
693                         int j;
694
695                         (*filterPos)[i]= xx;
696                         if((flags & SWS_BICUBIC) || (flags & SWS_X))
697                         {
698                                 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
699                                 double y1,y2,y3,y4;
700                                 double A= -0.6;
701                                 if(flags & SWS_BICUBIC){
702                                                 // Equation is from VirtualDub
703                                         y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
704                                         y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
705                                         y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
706                                         y4 = (                  +           A*d*d -       A*d*d*d);
707                                 }else{
708                                                 // cubic interpolation (derived it myself)
709                                         y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
710                                         y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
711                                         y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
712                                         y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
713                                 }
714
715 //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
716                                 filter[i*filterSize + 0]= y1;
717                                 filter[i*filterSize + 1]= y2;
718                                 filter[i*filterSize + 2]= y3;
719                                 filter[i*filterSize + 3]= y4;
720 //                              printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
721                         }
722                         else
723                         {
724                                 //Bilinear upscale / linear interpolate / Area averaging
725                                 for(j=0; j<filterSize; j++)
726                                 {
727                                         double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
728                                         double coeff= 1.0 - d;
729                                         if(coeff<0) coeff=0;
730         //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
731                                         filter[i*filterSize + j]= coeff;
732                                         xx++;
733                                 }
734                         }
735                         xDstInSrc+= xInc;
736                 }
737         }
738         else // downscale
739         {
740                 int xDstInSrc;
741                 if(flags&SWS_BICUBIC)   filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
742                 else if(flags&SWS_X)    filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
743                 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
744                 else /* BILINEAR */     filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
745 //              printf("%d %d %d\n", *filterSize, srcW, dstW);
746                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
747
748                 xDstInSrc= xInc/2 - 0x8000;
749                 for(i=0; i<dstW; i++)
750                 {
751                         int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
752                         int j;
753                         (*filterPos)[i]= xx;
754                         for(j=0; j<filterSize; j++)
755                         {
756                                 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
757                                 double coeff;
758                                 if((flags & SWS_BICUBIC) || (flags & SWS_X))
759                                 {
760                                         double A= -0.75;
761 //                                      d*=2;
762                                         // Equation is from VirtualDub
763                                         if(d<1.0)
764                                                 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
765                                         else if(d<2.0)
766                                                 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
767                                         else
768                                                 coeff=0.0;
769                                 }
770                                 else if(flags & SWS_AREA)
771                                 {
772                                         double srcPixelSize= (1<<16)/(double)xInc;
773                                         if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
774                                         else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
775                                         else coeff=0.0;
776                                 }
777                                 else
778                                 {
779                                         coeff= 1.0 - d;
780                                         if(coeff<0) coeff=0;
781                                 }
782 //                              printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
783                                 filter[i*filterSize + j]= coeff;
784                                 xx++;
785                         }
786                         xDstInSrc+= xInc;
787                 }
788         }
789
790         /* apply src & dst Filter to filter -> filter2
791            free(filter);
792         */
793         filter2Size= filterSize;
794         if(srcFilter) filter2Size+= srcFilter->length - 1;
795         if(dstFilter) filter2Size+= dstFilter->length - 1;
796         filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
797
798         for(i=0; i<dstW; i++)
799         {
800                 int j;
801                 SwsVector scaleFilter;
802                 SwsVector *outVec;
803
804                 scaleFilter.coeff= filter + i*filterSize;
805                 scaleFilter.length= filterSize;
806
807                 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
808                 else          outVec= &scaleFilter;
809
810                 ASSERT(outVec->length == filter2Size)
811                 //FIXME dstFilter
812
813                 for(j=0; j<outVec->length; j++)
814                 {
815                         filter2[i*filter2Size + j]= outVec->coeff[j];
816                 }
817
818                 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
819
820                 if(outVec != &scaleFilter) freeVec(outVec);
821         }
822         free(filter); filter=NULL;
823
824         /* try to reduce the filter-size (step1 find size and shift left) */
825         // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
826         minFilterSize= 0;
827         for(i=dstW-1; i>=0; i--)
828         {
829                 int min= filter2Size;
830                 int j;
831                 double cutOff=0.0;
832
833                 /* get rid off near zero elements on the left by shifting left */
834                 for(j=0; j<filter2Size; j++)
835                 {
836                         int k;
837                         cutOff += ABS(filter2[i*filter2Size]);
838
839                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
840
841                         /* preserve Monotonicity because the core cant handle the filter otherwise */
842                         if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
843
844                         // Move filter coeffs left
845                         for(k=1; k<filter2Size; k++)
846                                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
847                         filter2[i*filter2Size + k - 1]= 0.0;
848                         (*filterPos)[i]++;
849                 }
850
851                 cutOff=0.0;
852                 /* count near zeros on the right */
853                 for(j=filter2Size-1; j>0; j--)
854                 {
855                         cutOff += ABS(filter2[i*filter2Size + j]);
856
857                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
858                         min--;
859                 }
860
861                 if(min>minFilterSize) minFilterSize= min;
862         }
863
864         filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
865         filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
866         *outFilterSize= filterSize;
867
868         if((flags&SWS_PRINT_INFO) && verbose)
869                 printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
870         /* try to reduce the filter-size (step2 reduce it) */
871         for(i=0; i<dstW; i++)
872         {
873                 int j;
874
875                 for(j=0; j<filterSize; j++)
876                 {
877                         if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
878                         else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
879                 }
880         }
881         free(filter2); filter2=NULL;
882         
883         ASSERT(filterSize > 0)
884
885         //FIXME try to align filterpos if possible
886
887         //fix borders
888         for(i=0; i<dstW; i++)
889         {
890                 int j;
891                 if((*filterPos)[i] < 0)
892                 {
893                         // Move filter coeffs left to compensate for filterPos
894                         for(j=1; j<filterSize; j++)
895                         {
896                                 int left= MAX(j + (*filterPos)[i], 0);
897                                 filter[i*filterSize + left] += filter[i*filterSize + j];
898                                 filter[i*filterSize + j]=0;
899                         }
900                         (*filterPos)[i]= 0;
901                 }
902
903                 if((*filterPos)[i] + filterSize > srcW)
904                 {
905                         int shift= (*filterPos)[i] + filterSize - srcW;
906                         // Move filter coeffs right to compensate for filterPos
907                         for(j=filterSize-2; j>=0; j--)
908                         {
909                                 int right= MIN(j + shift, filterSize-1);
910                                 filter[i*filterSize +right] += filter[i*filterSize +j];
911                                 filter[i*filterSize +j]=0;
912                         }
913                         (*filterPos)[i]= srcW - filterSize;
914                 }
915         }
916
917         // Note the +1 is for the MMXscaler which reads over the end
918         *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
919         memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
920
921         /* Normalize & Store in outFilter */
922         for(i=0; i<dstW; i++)
923         {
924                 int j;
925                 double sum=0;
926                 double scale= one;
927                 for(j=0; j<filterSize; j++)
928                 {
929                         sum+= filter[i*filterSize + j];
930                 }
931                 scale/= sum;
932                 for(j=0; j<filterSize; j++)
933                 {
934                         (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
935                 }
936         }
937
938         free(filter);
939 }
940
941 #ifdef ARCH_X86
/**
 * Generates the run-time horizontal scaling code for the MMX2 path into funnyCode.
 *
 * The asm statement below does not scale anything when it runs here: it jumps
 * over the code template (labels 0..9) and only records the template's
 * address, its length, and the offsets of the byte just before labels 1 and 2
 * (the last byte of each pshufw instruction). The C loop then copies the
 * template into funnyCode once per four loop steps and patches those bytes.
 *
 * dstW      - destination width; dstW/8 loop steps are performed
 * xInc      - source step per loop step in 16.16 fixed point (xpos>>16 is the integer part)
 * funnyCode - output buffer for the generated code; its size is not checked here
 */
942 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
943 {
944         uint8_t *fragment;          // address of the code template (label 0)
945         int imm8OfPShufW1;          // offset of the byte before label 1, relative to label 0
946         int imm8OfPShufW2;          // offset of the byte before label 2, relative to label 0
947         int fragmentLength;         // label 9 - label 0, i.e. size of the template in bytes
948
949         int xpos, i;
950
951         // create an optimized horizontal scaling routine
952
953         //code fragment
954
955         asm volatile(
956                 "jmp 9f                         \n\t"
957         // Begin
958                 "0:                             \n\t"
959                 "movq (%%esi), %%mm0            \n\t" //FIXME Alignment
960                 "movq %%mm0, %%mm1              \n\t"
961                 "psrlq $8, %%mm0                \n\t"
962                 "punpcklbw %%mm7, %%mm1 \n\t"
963                 "movq %%mm2, %%mm3              \n\t"
964                 "punpcklbw %%mm7, %%mm0 \n\t"
965                 "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
966                 "pshufw $0xFF, %%mm1, %%mm1     \n\t"
967                 "1:                             \n\t"
968                 "adcl %%edx, %%esi              \n\t" //xx+= (4*lumXInc)>>16 + carry
969                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
970                 "2:                             \n\t"
971                 "psrlw $9, %%mm3                \n\t"
972                 "psubw %%mm1, %%mm0             \n\t"
973                 "pmullw %%mm3, %%mm0            \n\t"
974                 "paddw %%mm6, %%mm2             \n\t" // 2*alpha += xpos&0xFFFF
975                 "psllw $7, %%mm1                \n\t"
976                 "paddw %%mm1, %%mm0             \n\t"
977
978                 "movq %%mm0, (%%edi, %%eax)     \n\t"
979
980                 "addl $8, %%eax                 \n\t"
981         // End
982                 "9:                             \n\t"
983 //              "int $3\n\t"
        // %0 = address of label 0; %1 / %2 = (label 1 / label 2) - 1 - label 0;
        // %3 = label 9 - label 0
984                 "leal 0b, %0                    \n\t"
985                 "leal 1b, %1                    \n\t"
986                 "leal 2b, %2                    \n\t"
987                 "decl %1                        \n\t"
988                 "decl %2                        \n\t"
989                 "subl %0, %1                    \n\t"
990                 "subl %0, %2                    \n\t"
991                 "leal 9b, %3                    \n\t"
992                 "subl %0, %3                    \n\t"
993                 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
994                 "=r" (fragmentLength)
995         );
996
997         xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
998
        // xpos advances by xInc on every step, but a fragment is emitted only on
        // every 4th step; that fragment is patched with the offsets of 4 consecutive steps
999         for(i=0; i<dstW/8; i++)
1000         {
1001                 int xx=xpos>>16;
1002
1003                 if((i&3) == 0)
1004                 {
             // integer source offsets, relative to xx, of this step and the next three
1005                         int a=0;
1006                         int b=((xpos+xInc)>>16) - xx;
1007                         int c=((xpos+xInc*2)>>16) - xx;
1008                         int d=((xpos+xInc*3)>>16) - xx;
1009
             // fragment number i/4 goes to byte offset fragmentLength*i/4
1010                         memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1011
             // pack the four offsets, 2 bits each, into the patched byte of both pshufw instructions
             // NOTE(review): offsets above 3 would not fit into 2 bits — presumably
             // guaranteed by the caller only using this when not downscaling; confirm
1012                         funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1013                         funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1014                                 a | (b<<2) | (c<<4) | (d<<6);
1015
1016                         // if we don't need to read 8 bytes then don't :), reduces the chance of
1017                         // crossing a cache line
             // NOTE(review): byte 1 of the fragment is presumably the opcode byte of the
             // leading movq, with 0x6E turning it into a 4 byte load — confirm
1018                         if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1019
             // terminate after this fragment; the next memcpy (if any) overwrites this byte
1020                         funnyCode[fragmentLength*(i+4)/4]= RET;
1021                 }
1022                 xpos+=xInc;
1023         }
1024 }
1025 #endif // ARCH_X86
1026
//FIXME remove
/* Empty stub: performs no work. */
void SwScale_Init()
{
        /* intentionally empty */
}
1030
1031 static void globalInit(){
1032     // generating tables:
1033     int i;
1034     for(i=0; i<768; i++){
1035         int c= MIN(MAX(i-256, 0), 255);
1036         clip_table[i]=c;
1037         yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1038         yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1039         yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1040         yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1041         yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1042     }
1043
1044     for(i=0; i<768; i++)
1045     {
1046         int v= clip_table[i];
1047         clip_table16b[i]= v>>3;
1048         clip_table16g[i]= (v<<3)&0x07E0;
1049         clip_table16r[i]= (v<<8)&0xF800;
1050         clip_table15b[i]= v>>3;
1051         clip_table15g[i]= (v<<2)&0x03E0;
1052         clip_table15r[i]= (v<<7)&0x7C00;
1053     }
1054
1055 cpuCaps= gCpuCaps;
1056
1057 #ifdef RUNTIME_CPUDETECT
1058 #ifdef CAN_COMPILE_X86_ASM
1059         // ordered per speed fasterst first
1060         if(gCpuCaps.hasMMX2)
1061                 swScale= swScale_MMX2;
1062         else if(gCpuCaps.has3DNow)
1063                 swScale= swScale_3DNow;
1064         else if(gCpuCaps.hasMMX)
1065                 swScale= swScale_MMX;
1066         else
1067                 swScale= swScale_C;
1068
1069 #else
1070         swScale= swScale_C;
1071         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1072 #endif
1073 #else //RUNTIME_CPUDETECT
1074 #ifdef HAVE_MMX2
1075         swScale= swScale_MMX2;
1076         cpuCaps.has3DNow = 0;
1077 #elif defined (HAVE_3DNOW)
1078         swScale= swScale_3DNow;
1079         cpuCaps.hasMMX2 = 0;
1080 #elif defined (HAVE_MMX)
1081         swScale= swScale_MMX;
1082         cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1083 #else
1084         swScale= swScale_C;
1085         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1086 #endif
1087 #endif //!RUNTIME_CPUDETECT
1088 }
1089
1090
/**
 * Allocates and initialises a scaler context for converting a srcW x srcH
 * image in srcFormat into a dstW x dstH image in dstFormat.
 *
 * Returns NULL when srcW<4, srcH<1, dstW<8 or dstH<1, or when srcFormat is
 * not IMGFMT_YV12 / IMGFMT_I420 / IMGFMT_IYUV. dstFormat is stored in the
 * context but not validated here.
 * srcFilter / dstFilter may be NULL; an all-NULL SwsFilter is substituted.
 * flags selects the scaling algorithm (SWS_FAST_BILINEAR, SWS_BICUBIC, ...)
 * and, with SWS_PRINT_INFO, enables the informational output below.
 * None of the memalign() results are checked before use.
 */
1091 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1092                          SwsFilter *srcFilter, SwsFilter *dstFilter){
1093
1094         SwsContext *c;
1095         int i;
1096         SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1097
             // NOTE(review): presumably clears the MMX state before the floating point
             // filter setup below — confirm
1098 #ifdef ARCH_X86
1099         if(gCpuCaps.hasMMX)
1100                 asm volatile("emms\n\t"::: "memory");
1101 #endif
1102
             // swScale is assigned by globalInit(), so this runs the global setup only once
1103         if(swScale==NULL) globalInit();
1104
1105         /* sanity check */
1106         if(srcW<4 || srcH<1 || dstW<8 || dstH<1) return NULL; //FIXME check if these are enough and try to lower them after fixing the relevant parts of the code
1107         
1108         if(srcFormat!=IMGFMT_YV12 && srcFormat!=IMGFMT_I420 && srcFormat!=IMGFMT_IYUV) return NULL;
1109
1110         if(!dstFilter) dstFilter= &dummyFilter;
1111         if(!srcFilter) srcFilter= &dummyFilter;
1112
1113         c= memalign(64, sizeof(SwsContext));
1114         memset(c, 0, sizeof(SwsContext));
1115
1116         c->srcW= srcW;
1117         c->srcH= srcH;
1118         c->dstW= dstW;
1119         c->dstH= dstH;
             // source step per destination pixel / line in 16.16 fixed point, rounded to nearest
1120         c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1121         c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1122         c->flags= flags;
1123         c->dstFormat= dstFormat;
1124         c->srcFormat= srcFormat;
1125
1126         if(cpuCaps.hasMMX2)
1127         {
                     // the MMX2 scaler needs: no horizontal downscaling, dstW a multiple of 32, srcW a multiple of 16
1128                 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1129                 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1130                 {
1131                         if(flags&SWS_PRINT_INFO)
1132                                 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1133                 }
1134         }
1135         else
1136                 c->canMMX2BeUsed=0;
1137
1138         // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1139         // but only for the FAST_BILINEAR mode otherwise do correct scaling
1140         // n-2 is the last chrominance sample available
1141         // this is not perfect, but no one should notice the difference, the more correct variant
1142         // would be like the vertical one, but that would require some special code for the
1143         // first and last pixel
1144         if(flags&SWS_FAST_BILINEAR)
1145         {
1146                 if(c->canMMX2BeUsed)    c->lumXInc+= 20;
1147                 //we don't use the x86asm scaler if mmx is available
1148                 else if(cpuCaps.hasMMX) c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1149         }
1150
1151         /* set chrXInc & chrDstW */
1152         if((flags&SWS_FULL_UV_IPOL) && !isHalfChrH(dstFormat))
1153                 c->chrXInc= c->lumXInc>>1, c->chrDstW= dstW;
1154         else
1155                 c->chrXInc= c->lumXInc,    c->chrDstW= (dstW+1)>>1;
1156
1157         /* set chrYInc & chrDstH */
1158         if(isHalfChrV(dstFormat))
1159                 c->chrYInc= c->lumYInc,    c->chrDstH= (dstH+1)>>1;
1160         else    c->chrYInc= c->lumYInc>>1, c->chrDstH= dstH;
1161
1162         /* precalculate horizontal scaler filter coefficients */
1163         {
                     // filter sizes are rounded up to a multiple of 4 taps when MMX is available
1164                 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1165
                     // the chroma source width passed to initFilter is (srcW+1)>>1
1166                 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1167                                  srcW      ,       dstW, filterAlign, 1<<14, flags,
1168                                  srcFilter->lumH, dstFilter->lumH);
1169                 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1170                                 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1171                                  srcFilter->chrH, dstFilter->chrH);
1172
1173 #ifdef ARCH_X86
1174 // can't downscale !!!
1175                 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1176                 {
1177                         initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode);
1178                         initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1179                 }
1180 #endif
1181         } // Init Horizontal stuff
1182
1183
1184
1185         /* precalculate vertical scaler filter coefficients */
1186         initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1187                         srcH      ,        dstH, 1, (1<<12)-4, flags,
1188                         srcFilter->lumV, dstFilter->lumV);
1189         initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1190                         (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1191                          srcFilter->chrV, dstFilter->chrV);
1192
1193         // Calculate Buffer Sizes so that they won't run out while handling these damn slices
             // start from the vertical filter sizes and grow them for every output line whose
             // (even aligned) last needed source line lies beyond the current buffer
1194         c->vLumBufSize= c->vLumFilterSize;
1195         c->vChrBufSize= c->vChrFilterSize;
1196         for(i=0; i<dstH; i++)
1197         {
1198                 int chrI= i*c->chrDstH / dstH;
1199                 int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1200                                  ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1201                 nextSlice&= ~1; // Slices start at even boundaries
1202                 if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1203                         c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1204                 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1205                         c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1206         }
1207
1208         // allocate pixbufs dynamically (the rationale in the original comment was cut off mid-sentence)
             // the pointer tables have 2*BufSize entries; every line buffer is stored twice,
             // at index i and at index i+BufSize
1209         c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1210         c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1211         //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
             // NOTE(review): the line buffers are a fixed 4000 / 8000 bytes regardless of dstW —
             // confirm that the supported widths are bounded accordingly
1212         for(i=0; i<c->vLumBufSize; i++)
1213                 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1214         for(i=0; i<c->vChrBufSize; i++)
1215                 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1216
1217         //try to avoid drawing green stuff between the right end and the stride end
1218         for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1219         for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1220
1221         ASSERT(c->chrDstH <= dstH)
1222
1223         // pack filter data for mmx code
             // every vertical coefficient is replicated into 4 consecutive int16_t entries
1224         if(cpuCaps.hasMMX)
1225         {
1226                 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1227                 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1228                 for(i=0; i<c->vLumFilterSize*dstH; i++)
1229                         c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1230                                 c->vLumFilter[i];
1231                 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1232                         c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1233                                 c->vChrFilter[i];
1234         }
1235
             // one-line summary on stderr: scaler type, output format, CPU extension
1236         if(flags&SWS_PRINT_INFO)
1237         {
1238 #ifdef DITHER1XBPP
1239                 char *dither= " dithered";
1240 #else
1241                 char *dither= "";
1242 #endif
                     // NOTE(review): SWS_X is tested by initFilter but has no case in this chain,
                     // so it is reported as "ehh flags invalid?!" — confirm whether that is intended
1243                 if(flags&SWS_FAST_BILINEAR)
1244                         fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
1245                 else if(flags&SWS_BILINEAR)
1246                         fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
1247                 else if(flags&SWS_BICUBIC)
1248                         fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
1249                 else if(flags&SWS_POINT)
1250                         fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler ");
1251                 else if(flags&SWS_AREA)
1252                         fprintf(stderr, "\nSwScaler: Area Averageing scaler ");
1253                 else
1254                         fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1255
1256                 if(dstFormat==IMGFMT_BGR15)
1257                         fprintf(stderr, "with%s BGR15 output ", dither);
1258                 else if(dstFormat==IMGFMT_BGR16)
1259                         fprintf(stderr, "with%s BGR16 output ", dither);
1260                 else if(dstFormat==IMGFMT_BGR24)
1261                         fprintf(stderr, "with BGR24 output ");
1262                 else if(dstFormat==IMGFMT_BGR32)
1263                         fprintf(stderr, "with BGR32 output ");
1264                 else if(dstFormat==IMGFMT_YV12)
1265                         fprintf(stderr, "with YV12 output ");
1266                 else if(dstFormat==IMGFMT_I420)
1267                         fprintf(stderr, "with I420 output ");
1268                 else if(dstFormat==IMGFMT_IYUV)
1269                         fprintf(stderr, "with IYUV output ");
1270                 else
1271                         fprintf(stderr, "without output ");
1272
1273                 if(cpuCaps.hasMMX2)
1274                         fprintf(stderr, "using MMX2\n");
1275                 else if(cpuCaps.has3DNow)
1276                         fprintf(stderr, "using 3DNOW\n");
1277                 else if(cpuCaps.hasMMX)
1278                         fprintf(stderr, "using MMX\n");
1279                 else
1280                         fprintf(stderr, "using C\n");
1281         }
1282
             // detailed report on stdout, only when the global verbose setting is also non-zero
1283         if((flags & SWS_PRINT_INFO) && verbose)
1284         {
1285                 if(cpuCaps.hasMMX)
1286                 {
1287                         if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1288                                 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1289                         else
1290                         {
1291                                 if(c->hLumFilterSize==4)
1292                                         printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1293                                 else if(c->hLumFilterSize==8)
1294                                         printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1295                                 else
1296                                         printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1297
1298                                 if(c->hChrFilterSize==4)
1299                                         printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1300                                 else if(c->hChrFilterSize==8)
1301                                         printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1302                                 else
1303                                         printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1304                         }
1305                 }
1306                 else
1307                 {
1308 #ifdef ARCH_X86
1309                         printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1310 #else
1311                         if(flags & SWS_FAST_BILINEAR)
1312                                 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1313                         else
1314                                 printf("SwScaler: using C scaler for horizontal scaling\n");
1315 #endif
1316                 }
1317                 if(isPlanarYUV(dstFormat))
1318                 {
1319                         if(c->vLumFilterSize==1)
1320                                 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1321                         else
1322                                 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1323                 }
1324                 else
1325                 {
1326                         if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1327                                 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1328                                        "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1329                         else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1330                                 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1331                         else
1332                                 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1333                 }
1334
1335                 if(dstFormat==IMGFMT_BGR24)
1336                         printf("SwScaler: using %s YV12->BGR24 Converter\n",
1337                                 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1338                 else if(dstFormat==IMGFMT_BGR32)
1339                         printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1340                 else if(dstFormat==IMGFMT_BGR16)
1341                         printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1342                 else if(dstFormat==IMGFMT_BGR15)
1343                         printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1344
1345                 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1346         }
1347
1348         return c;
1349 }
1350
1351 /**
1352  * returns a normalized gaussian curve used to filter stuff
1353  * quality=3 is high quality, lowwer is lowwer quality
1354  */
1355
1356 SwsVector *getGaussianVec(double variance, double quality){
1357         const int length= (int)(variance*quality + 0.5) | 1;
1358         int i;
1359         double *coeff= memalign(sizeof(double), length*sizeof(double));
1360         double middle= (length-1)*0.5;
1361         SwsVector *vec= malloc(sizeof(SwsVector));
1362
1363         vec->coeff= coeff;
1364         vec->length= length;
1365
1366         for(i=0; i<length; i++)
1367         {
1368                 double dist= i-middle;
1369                 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1370         }
1371
1372         normalizeVec(vec, 1.0);
1373
1374         return vec;
1375 }
1376
1377 SwsVector *getConstVec(double c, int length){
1378         int i;
1379         double *coeff= memalign(sizeof(double), length*sizeof(double));
1380         SwsVector *vec= malloc(sizeof(SwsVector));
1381
1382         vec->coeff= coeff;
1383         vec->length= length;
1384
1385         for(i=0; i<length; i++)
1386                 coeff[i]= c;
1387
1388         return vec;
1389 }
1390
1391
1392 SwsVector *getIdentityVec(void){
1393         double *coeff= memalign(sizeof(double), sizeof(double));
1394         SwsVector *vec= malloc(sizeof(SwsVector));
1395         coeff[0]= 1.0;
1396
1397         vec->coeff= coeff;
1398         vec->length= 1;
1399
1400         return vec;
1401 }
1402
1403 void normalizeVec(SwsVector *a, double height){
1404         int i;
1405         double sum=0;
1406         double inv;
1407
1408         for(i=0; i<a->length; i++)
1409                 sum+= a->coeff[i];
1410
1411         inv= height/sum;
1412
1413         for(i=0; i<a->length; i++)
1414                 a->coeff[i]*= height;
1415 }
1416
1417 void scaleVec(SwsVector *a, double scalar){
1418         int i;
1419
1420         for(i=0; i<a->length; i++)
1421                 a->coeff[i]*= scalar;
1422 }
1423
1424 static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
1425         int length= a->length + b->length - 1;
1426         double *coeff= memalign(sizeof(double), length*sizeof(double));
1427         int i, j;
1428         SwsVector *vec= malloc(sizeof(SwsVector));
1429
1430         vec->coeff= coeff;
1431         vec->length= length;
1432
1433         for(i=0; i<length; i++) coeff[i]= 0.0;
1434
1435         for(i=0; i<a->length; i++)
1436         {
1437                 for(j=0; j<b->length; j++)
1438                 {
1439                         coeff[i+j]+= a->coeff[i]*b->coeff[j];
1440                 }
1441         }
1442
1443         return vec;
1444 }
1445
1446 static SwsVector *sumVec(SwsVector *a, SwsVector *b){
1447         int length= MAX(a->length, b->length);
1448         double *coeff= memalign(sizeof(double), length*sizeof(double));
1449         int i;
1450         SwsVector *vec= malloc(sizeof(SwsVector));
1451
1452         vec->coeff= coeff;
1453         vec->length= length;
1454
1455         for(i=0; i<length; i++) coeff[i]= 0.0;
1456
1457         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1458         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
1459
1460         return vec;
1461 }
1462
1463 static SwsVector *diffVec(SwsVector *a, SwsVector *b){
1464         int length= MAX(a->length, b->length);
1465         double *coeff= memalign(sizeof(double), length*sizeof(double));
1466         int i;
1467         SwsVector *vec= malloc(sizeof(SwsVector));
1468
1469         vec->coeff= coeff;
1470         vec->length= length;
1471
1472         for(i=0; i<length; i++) coeff[i]= 0.0;
1473
1474         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1475         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
1476
1477         return vec;
1478 }
1479
1480 /* shift left / or right if "shift" is negative */
1481 static SwsVector *getShiftedVec(SwsVector *a, int shift){
1482         int length= a->length + ABS(shift)*2;
1483         double *coeff= memalign(sizeof(double), length*sizeof(double));
1484         int i;
1485         SwsVector *vec= malloc(sizeof(SwsVector));
1486
1487         vec->coeff= coeff;
1488         vec->length= length;
1489
1490         for(i=0; i<length; i++) coeff[i]= 0.0;
1491
1492         for(i=0; i<a->length; i++)
1493         {
1494                 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1495         }
1496
1497         return vec;
1498 }
1499
1500 void shiftVec(SwsVector *a, int shift){
1501         SwsVector *shifted= getShiftedVec(a, shift);
1502         free(a->coeff);
1503         a->coeff= shifted->coeff;
1504         a->length= shifted->length;
1505         free(shifted);
1506 }
1507
1508 void addVec(SwsVector *a, SwsVector *b){
1509         SwsVector *sum= sumVec(a, b);
1510         free(a->coeff);
1511         a->coeff= sum->coeff;
1512         a->length= sum->length;
1513         free(sum);
1514 }
1515
1516 void subVec(SwsVector *a, SwsVector *b){
1517         SwsVector *diff= diffVec(a, b);
1518         free(a->coeff);
1519         a->coeff= diff->coeff;
1520         a->length= diff->length;
1521         free(diff);
1522 }
1523
1524 void convVec(SwsVector *a, SwsVector *b){
1525         SwsVector *conv= getConvVec(a, b);
1526         free(a->coeff);
1527         a->coeff= conv->coeff;
1528         a->length= conv->length;
1529         free(conv);
1530 }
1531
1532 SwsVector *cloneVec(SwsVector *a){
1533         double *coeff= memalign(sizeof(double), a->length*sizeof(double));
1534         int i;
1535         SwsVector *vec= malloc(sizeof(SwsVector));
1536
1537         vec->coeff= coeff;
1538         vec->length= a->length;
1539
1540         for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
1541
1542         return vec;
1543 }
1544
1545 void printVec(SwsVector *a){
1546         int i;
1547         double max=0;
1548         double min=0;
1549         double range;
1550
1551         for(i=0; i<a->length; i++)
1552                 if(a->coeff[i]>max) max= a->coeff[i];
1553
1554         for(i=0; i<a->length; i++)
1555                 if(a->coeff[i]<min) min= a->coeff[i];
1556
1557         range= max - min;
1558
1559         for(i=0; i<a->length; i++)
1560         {
1561                 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
1562                 printf("%1.3f ", a->coeff[i]);
1563                 for(;x>0; x--) printf(" ");
1564                 printf("|\n");
1565         }
1566 }
1567
1568 void freeVec(SwsVector *a){
1569         if(!a) return;
1570         if(a->coeff) free(a->coeff);
1571         a->coeff=NULL;
1572         a->length=0;
1573         free(a);
1574 }
1575
1576 void freeSwsContext(SwsContext *c){
1577         int i;
1578
1579         if(!c) return;
1580
1581         if(c->lumPixBuf)
1582         {
1583                 for(i=0; i<c->vLumBufSize; i++)
1584                 {
1585                         if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
1586                         c->lumPixBuf[i]=NULL;
1587                 }
1588                 free(c->lumPixBuf);
1589                 c->lumPixBuf=NULL;
1590         }
1591
1592         if(c->chrPixBuf)
1593         {
1594                 for(i=0; i<c->vChrBufSize; i++)
1595                 {
1596                         if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
1597                         c->chrPixBuf[i]=NULL;
1598                 }
1599                 free(c->chrPixBuf);
1600                 c->chrPixBuf=NULL;
1601         }
1602
1603         if(c->vLumFilter) free(c->vLumFilter);
1604         c->vLumFilter = NULL;
1605         if(c->vChrFilter) free(c->vChrFilter);
1606         c->vChrFilter = NULL;
1607         if(c->hLumFilter) free(c->hLumFilter);
1608         c->hLumFilter = NULL;
1609         if(c->hChrFilter) free(c->hChrFilter);
1610         c->hChrFilter = NULL;
1611
1612         if(c->vLumFilterPos) free(c->vLumFilterPos);
1613         c->vLumFilterPos = NULL;
1614         if(c->vChrFilterPos) free(c->vChrFilterPos);
1615         c->vChrFilterPos = NULL;
1616         if(c->hLumFilterPos) free(c->hLumFilterPos);
1617         c->hLumFilterPos = NULL;
1618         if(c->hChrFilterPos) free(c->hChrFilterPos);
1619         c->hChrFilterPos = NULL;
1620
1621         if(c->lumMmxFilter) free(c->lumMmxFilter);
1622         c->lumMmxFilter = NULL;
1623         if(c->chrMmxFilter) free(c->chrMmxFilter);
1624         c->chrMmxFilter = NULL;
1625
1626         free(c);
1627 }
1628
1629