]> git.sesse.net Git - ffmpeg/blob - postproc/swscale.c
various openbsd and general warning fixes - patch by Björn Sandell <biorn@dce.chalmer...
[ffmpeg] / postproc / swscale.c
1 /*
2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20   supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
21   supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
22   BGR15/16 support dithering
23   
24   unscaled special converters
25   YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
26   YV12/I420/IYUV -> YV12/I420/IYUV
27   YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
28   BGR24 -> BGR32 & RGB24 -> RGB32
29   BGR32 -> BGR24 & RGB32 -> RGB24
30   BGR15 -> BGR16
31 */
32
33 /* 
34 tested special converters
35  YV12/I420 -> BGR16
36  YV12 -> YV12
37  BGR15 -> BGR16
38  BGR16 -> BGR16
39
40 untested special converters
41   YV12/I420 -> BGR15/BGR24/BGR32 (its the yuv2rgb stuff, so it should be ok)
42   YV12/I420 -> YV12/I420
43   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
44   BGR24 -> BGR32 & RGB24 -> RGB32
45   BGR32 -> BGR24 & RGB32 -> RGB24
46   BGR24 -> YV12
47 */
48
49 #include <inttypes.h>
50 #include <string.h>
51 #include <math.h>
52 #include <stdio.h>
53 #include "../config.h"
54 #include "../mangle.h"
55 #include <assert.h>
56 #ifdef HAVE_MALLOC_H
57 #include <malloc.h>
58 #else
59 #include <stdlib.h>
60 #endif
61 #include "swscale.h"
62 #include "../cpudetect.h"
63 #include "../bswap.h"
64 #include "../libvo/img_format.h"
65 #include "rgb2rgb.h"
66 #include "../libvo/fastmemcpy.h"
67 #include "../mp_msg.h"
68 #undef MOVNTQ
69 #undef PAVGB
70
71 //#undef HAVE_MMX2
72 //#define HAVE_3DNOW
73 //#undef HAVE_MMX
74 //#undef ARCH_X86
75 //#define WORDS_BIGENDIAN
76 #define DITHER1XBPP
77
78 #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
79
80 #define RET 0xC3 //near return opcode for X86
81
82 #ifdef MP_DEBUG
83 #define ASSERT(x) assert(x);
84 #else
85 #define ASSERT(x) ;
86 #endif
87
88 #ifdef M_PI
89 #define PI M_PI
90 #else
91 #define PI 3.14159265358979323846
92 #endif
93
94 //FIXME replace this with something faster
95 #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
96 #define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
97 #define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
98 #define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
99 #define isPacked(x)    ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
100 #define isGray(x)      ((x)==IMGFMT_Y800)
101 #define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
102                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
103                         || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
104                         || (x)==IMGFMT_Y800)
105 #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
106                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
107 #define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
108
109 #define RGB2YUV_SHIFT 16
110 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
111 #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
112 #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
113 #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
114 #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
115 #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
116 #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
117 #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
118 #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
119
120 extern int verbose; // defined in mplayer.c
121 /*
122 NOTES
123 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
124
125 TODO
126 more intelligent misalignment avoidance for the horizontal scaler
127 write special vertical cubic upscale version
128 Optimize C code (yv12 / minmax)
129 add support for packed pixel yuv input & output
130 add support for Y8 output
131 optimize bgr24 & bgr32
132 add BGR4 output support
133 write special BGR->BGR scaler
134 deglobalize yuv2rgb*.c
135 */
136
137 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
138 #define MIN(a,b) ((a) > (b) ? (b) : (a))
139 #define MAX(a,b) ((a) < (b) ? (b) : (a))
140
141 #ifdef ARCH_X86
142 #define CAN_COMPILE_X86_ASM
143 #endif
144
145 #ifdef CAN_COMPILE_X86_ASM
146 static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
147 static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
148 static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
149 static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
150 static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
151 static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
152 static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
153 static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
154 static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
155 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
156 static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
157 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
158 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
159 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
160 static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
161
162 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
163 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
164 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
165 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
166
167 static uint64_t __attribute__((aligned(8))) dither4[2]={
168         0x0103010301030103LL,
169         0x0200020002000200LL,};
170
171 static uint64_t __attribute__((aligned(8))) dither8[2]={
172         0x0602060206020602LL,
173         0x0004000400040004LL,};
174
175 static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
176 static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
177 static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
178 static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
179 static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
180 static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
181
182 static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
183 static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
184 static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
185
186 #ifdef FAST_BGR2YV12
187 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
188 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
189 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
190 #else
191 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
192 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
193 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
194 #endif
195 static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
196 static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
197 static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
198
199 // FIXME remove
200 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
201 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
202 #endif
203
204 // clipping helper table for C implementations:
205 static unsigned char clip_table[768];
206
207 static unsigned short clip_table16b[768];
208 static unsigned short clip_table16g[768];
209 static unsigned short clip_table16r[768];
210 static unsigned short clip_table15b[768];
211 static unsigned short clip_table15g[768];
212 static unsigned short clip_table15r[768];
213
214 // yuv->rgb conversion tables:
215 static    int yuvtab_2568[256];
216 static    int yuvtab_3343[256];
217 static    int yuvtab_0c92[256];
218 static    int yuvtab_1a1e[256];
219 static    int yuvtab_40cf[256];
220 // Needed for cubic scaler to catch overflows
221 static    int clip_yuvtab_2568[768];
222 static    int clip_yuvtab_3343[768];
223 static    int clip_yuvtab_0c92[768];
224 static    int clip_yuvtab_1a1e[768];
225 static    int clip_yuvtab_40cf[768];
226
227 //global sws_flags from the command line
228 int sws_flags=2;
229
230 //global srcFilter
231 SwsFilter src_filter= {NULL, NULL, NULL, NULL};
232
233 float sws_lum_gblur= 0.0;
234 float sws_chr_gblur= 0.0;
235 int sws_chr_vshift= 0;
236 int sws_chr_hshift= 0;
237 float sws_chr_sharpen= 0.0;
238 float sws_lum_sharpen= 0.0;
239
240 /* cpuCaps combined from cpudetect and what's actually compiled in
241    (if support for something is not compiled in, it won't appear here) */
242 static CpuCaps cpuCaps;
243
244 void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
245              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
246
247 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
248
249 #ifdef CAN_COMPILE_X86_ASM
250 void in_asm_used_var_warning_killer()
251 {
252  volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
253  bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
254  M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
255  if(i) i=0;
256 }
257 #endif
258
/*
 * Plain C vertical scaler for planar YUV output.
 *
 * Every output sample is the sum of filterSize source lines weighted by the
 * matching filter coefficient, shifted right by 19 bits and clamped to 0..255.
 *
 * lumFilter/lumSrc/lumFilterSize: luma coefficients, luma source lines and
 *                                 their count; dstW luma samples are written
 *                                 to dest
 * chrFilter/chrSrc/chrFilterSize: same for chroma; each chroma line holds the
 *                                 U samples at index i and the V samples at
 *                                 index i+2048
 * uDest/vDest:                    chroma outputs, dstW>>1 samples each;
 *                                 chroma is skipped completely if uDest is NULL
 */
static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
        //FIXME Optimize (just quickly written, not optimized)
        const int chrW= dstW>>1;
        int x;
        int tap;

        // luma plane
        for(x=0; x<dstW; x++)
        {
                int acc= 0;

                for(tap=0; tap<lumFilterSize; tap++)
                        acc+= lumSrc[tap][x] * lumFilter[tap];

                acc>>= 19;
                dest[x]= MIN(MAX(acc, 0), 255);
        }

        // no chroma output requested for this line
        if(uDest == NULL) return;

        // chroma planes, U and V share the filter and the source line buffers
        for(x=0; x<chrW; x++)
        {
                int accU= 0;
                int accV= 0;

                for(tap=0; tap<chrFilterSize; tap++)
                {
                        const int16_t *line= chrSrc[tap];
                        const int coeff= chrFilter[tap];

                        accU+= line[x] * coeff;
                        accV+= line[x + 2048] * coeff;
                }

                accU>>= 19;
                accV>>= 19;
                uDest[x]= MIN(MAX(accU, 0), 255);
                vDest[x]= MIN(MAX(accV, 0), 255);
        }
}
291
292 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
293                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
294                                     uint8_t *dest, int dstW, int dstFormat)
295 {
296         if(dstFormat==IMGFMT_BGR32)
297         {
298                 int i;
299 #ifdef WORDS_BIGENDIAN
300         dest++;
301 #endif
302                 for(i=0; i<(dstW>>1); i++){
303                         int j;
304                         int Y1=0;
305                         int Y2=0;
306                         int U=0;
307                         int V=0;
308                         int Cb, Cr, Cg;
309                         for(j=0; j<lumFilterSize; j++)
310                         {
311                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
312                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
313                         }
314                         for(j=0; j<chrFilterSize; j++)
315                         {
316                                 U += chrSrc[j][i] * chrFilter[j];
317                                 V += chrSrc[j][i+2048] * chrFilter[j];
318                         }
319                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
320                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
321                         U >>= 19;
322                         V >>= 19;
323
324                         Cb= clip_yuvtab_40cf[U+ 256];
325                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
326                         Cr= clip_yuvtab_3343[V+ 256];
327
328                         dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
329                         dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
330                         dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
331
332                         dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
333                         dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
334                         dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
335                 }
336         }
337         else if(dstFormat==IMGFMT_BGR24)
338         {
339                 int i;
340                 for(i=0; i<(dstW>>1); i++){
341                         int j;
342                         int Y1=0;
343                         int Y2=0;
344                         int U=0;
345                         int V=0;
346                         int Cb, Cr, Cg;
347                         for(j=0; j<lumFilterSize; j++)
348                         {
349                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
350                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
351                         }
352                         for(j=0; j<chrFilterSize; j++)
353                         {
354                                 U += chrSrc[j][i] * chrFilter[j];
355                                 V += chrSrc[j][i+2048] * chrFilter[j];
356                         }
357                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
358                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
359                         U >>= 19;
360                         V >>= 19;
361
362                         Cb= clip_yuvtab_40cf[U+ 256];
363                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
364                         Cr= clip_yuvtab_3343[V+ 256];
365
366                         dest[0]=clip_table[((Y1 + Cb) >>13)];
367                         dest[1]=clip_table[((Y1 + Cg) >>13)];
368                         dest[2]=clip_table[((Y1 + Cr) >>13)];
369
370                         dest[3]=clip_table[((Y2 + Cb) >>13)];
371                         dest[4]=clip_table[((Y2 + Cg) >>13)];
372                         dest[5]=clip_table[((Y2 + Cr) >>13)];
373                         dest+=6;
374                 }
375         }
376         else if(dstFormat==IMGFMT_BGR16)
377         {
378                 int i;
379 #ifdef DITHER1XBPP
380                 static int ditherb1=1<<14;
381                 static int ditherg1=1<<13;
382                 static int ditherr1=2<<14;
383                 static int ditherb2=3<<14;
384                 static int ditherg2=3<<13;
385                 static int ditherr2=0<<14;
386
387                 ditherb1 ^= (1^2)<<14;
388                 ditherg1 ^= (1^2)<<13;
389                 ditherr1 ^= (1^2)<<14;
390                 ditherb2 ^= (3^0)<<14;
391                 ditherg2 ^= (3^0)<<13;
392                 ditherr2 ^= (3^0)<<14;
393 #else
394                 const int ditherb1=0;
395                 const int ditherg1=0;
396                 const int ditherr1=0;
397                 const int ditherb2=0;
398                 const int ditherg2=0;
399                 const int ditherr2=0;
400 #endif
401                 for(i=0; i<(dstW>>1); i++){
402                         int j;
403                         int Y1=0;
404                         int Y2=0;
405                         int U=0;
406                         int V=0;
407                         int Cb, Cr, Cg;
408                         for(j=0; j<lumFilterSize; j++)
409                         {
410                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
411                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
412                         }
413                         for(j=0; j<chrFilterSize; j++)
414                         {
415                                 U += chrSrc[j][i] * chrFilter[j];
416                                 V += chrSrc[j][i+2048] * chrFilter[j];
417                         }
418                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
419                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
420                         U >>= 19;
421                         V >>= 19;
422
423                         Cb= clip_yuvtab_40cf[U+ 256];
424                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
425                         Cr= clip_yuvtab_3343[V+ 256];
426
427                         ((uint16_t*)dest)[2*i] =
428                                 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
429                                 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
430                                 clip_table16r[(Y1 + Cr + ditherr1) >>13];
431
432                         ((uint16_t*)dest)[2*i+1] =
433                                 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
434                                 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
435                                 clip_table16r[(Y2 + Cr + ditherr2) >>13];
436                 }
437         }
438         else if(dstFormat==IMGFMT_BGR15)
439         {
440                 int i;
441 #ifdef DITHER1XBPP
442                 static int ditherb1=1<<14;
443                 static int ditherg1=1<<14;
444                 static int ditherr1=2<<14;
445                 static int ditherb2=3<<14;
446                 static int ditherg2=3<<14;
447                 static int ditherr2=0<<14;
448
449                 ditherb1 ^= (1^2)<<14;
450                 ditherg1 ^= (1^2)<<14;
451                 ditherr1 ^= (1^2)<<14;
452                 ditherb2 ^= (3^0)<<14;
453                 ditherg2 ^= (3^0)<<14;
454                 ditherr2 ^= (3^0)<<14;
455 #else
456                 const int ditherb1=0;
457                 const int ditherg1=0;
458                 const int ditherr1=0;
459                 const int ditherb2=0;
460                 const int ditherg2=0;
461                 const int ditherr2=0;
462 #endif
463                 for(i=0; i<(dstW>>1); i++){
464                         int j;
465                         int Y1=0;
466                         int Y2=0;
467                         int U=0;
468                         int V=0;
469                         int Cb, Cr, Cg;
470                         for(j=0; j<lumFilterSize; j++)
471                         {
472                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
473                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
474                         }
475                         for(j=0; j<chrFilterSize; j++)
476                         {
477                                 U += chrSrc[j][i] * chrFilter[j];
478                                 V += chrSrc[j][i+2048] * chrFilter[j];
479                         }
480                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
481                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
482                         U >>= 19;
483                         V >>= 19;
484
485                         Cb= clip_yuvtab_40cf[U+ 256];
486                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
487                         Cr= clip_yuvtab_3343[V+ 256];
488
489                         ((uint16_t*)dest)[2*i] =
490                                 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
491                                 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
492                                 clip_table15r[(Y1 + Cr + ditherr1) >>13];
493
494                         ((uint16_t*)dest)[2*i+1] =
495                                 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
496                                 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
497                                 clip_table15r[(Y2 + Cr + ditherr2) >>13];
498                 }
499         }
500 }
501
502
503 //Note: we have C, X86, MMX, MMX2, 3DNOW versions; there's no 3DNOW+MMX2 one
504 //Plain C versions
505 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
506 #define COMPILE_C
507 #endif
508
509 #ifdef CAN_COMPILE_X86_ASM
510
511 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
512 #define COMPILE_MMX
513 #endif
514
515 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
516 #define COMPILE_MMX2
517 #endif
518
519 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
520 #define COMPILE_3DNOW
521 #endif
522 #endif //CAN_COMPILE_X86_ASM
523
524 #undef HAVE_MMX
525 #undef HAVE_MMX2
526 #undef HAVE_3DNOW
527
528 #ifdef COMPILE_C
529 #undef HAVE_MMX
530 #undef HAVE_MMX2
531 #undef HAVE_3DNOW
532 #define RENAME(a) a ## _C
533 #include "swscale_template.c"
534 #endif
535
536 #ifdef CAN_COMPILE_X86_ASM
537
538 //X86 versions
539 /*
540 #undef RENAME
541 #undef HAVE_MMX
542 #undef HAVE_MMX2
543 #undef HAVE_3DNOW
544 #define ARCH_X86
545 #define RENAME(a) a ## _X86
546 #include "swscale_template.c"
547 */
548 //MMX versions
549 #ifdef COMPILE_MMX
550 #undef RENAME
551 #define HAVE_MMX
552 #undef HAVE_MMX2
553 #undef HAVE_3DNOW
554 #define RENAME(a) a ## _MMX
555 #include "swscale_template.c"
556 #endif
557
558 //MMX2 versions
559 #ifdef COMPILE_MMX2
560 #undef RENAME
561 #define HAVE_MMX
562 #define HAVE_MMX2
563 #undef HAVE_3DNOW
564 #define RENAME(a) a ## _MMX2
565 #include "swscale_template.c"
566 #endif
567
568 //3DNOW versions
569 #ifdef COMPILE_3DNOW
570 #undef RENAME
571 #define HAVE_MMX
572 #undef HAVE_MMX2
573 #define HAVE_3DNOW
574 #define RENAME(a) a ## _3DNow
575 #include "swscale_template.c"
576 #endif
577
578 #endif //CAN_COMPILE_X86_ASM
579
580 // minor note: the HAVE_xyz macros are messed up after this line, so don't use them
581
582
583 // old global scaler, dont use for new code
584 // will use sws_flags from the command line
585 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
586                              int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
587                              int srcW, int srcH, int dstW, int dstH){
588
589         static SwsContext *context=NULL;
590         int dstFormat;
591         int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
592
593         switch(dstbpp)
594         {
595                 case 8 : dstFormat= IMGFMT_Y8;          break;
596                 case 12: dstFormat= IMGFMT_YV12;        break;
597                 case 15: dstFormat= IMGFMT_BGR15;       break;
598                 case 16: dstFormat= IMGFMT_BGR16;       break;
599                 case 24: dstFormat= IMGFMT_BGR24;       break;
600                 case 32: dstFormat= IMGFMT_BGR32;       break;
601                 default: return;
602         }
603
604         if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
605
606         context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
607 }
608
609 // will use sws_flags & src_filter (from cmd line)
610 SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
611 {
612         int flags=0;
613         static int firstTime=1;
614
615 #ifdef ARCH_X86
616         if(gCpuCaps.hasMMX)
617                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
618 #endif
619         if(firstTime)
620         {
621                 firstTime=0;
622                 flags= SWS_PRINT_INFO;
623         }
624         else if(verbose>1) flags= SWS_PRINT_INFO;
625
626         if(src_filter.lumH) freeVec(src_filter.lumH);
627         if(src_filter.lumV) freeVec(src_filter.lumV);
628         if(src_filter.chrH) freeVec(src_filter.chrH);
629         if(src_filter.chrV) freeVec(src_filter.chrV);
630
631         if(sws_lum_gblur!=0.0){
632                 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
633                 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
634         }else{
635                 src_filter.lumH= getIdentityVec();
636                 src_filter.lumV= getIdentityVec();
637         }
638
639         if(sws_chr_gblur!=0.0){
640                 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
641                 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
642         }else{
643                 src_filter.chrH= getIdentityVec();
644                 src_filter.chrV= getIdentityVec();
645         }
646
647         if(sws_chr_sharpen!=0.0){
648                 SwsVector *g= getConstVec(-1.0, 3);
649                 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
650                 g->coeff[1]=2.0;
651                 addVec(id, g);
652                 convVec(src_filter.chrH, id);
653                 convVec(src_filter.chrV, id);
654                 freeVec(g);
655                 freeVec(id);
656         }
657
658         if(sws_lum_sharpen!=0.0){
659                 SwsVector *g= getConstVec(-1.0, 3);
660                 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
661                 g->coeff[1]=2.0;
662                 addVec(id, g);
663                 convVec(src_filter.lumH, id);
664                 convVec(src_filter.lumV, id);
665                 freeVec(g);
666                 freeVec(id);
667         }
668
669         if(sws_chr_hshift)
670                 shiftVec(src_filter.chrH, sws_chr_hshift);
671
672         if(sws_chr_vshift)
673                 shiftVec(src_filter.chrV, sws_chr_vshift);
674
675         normalizeVec(src_filter.chrH, 1.0);
676         normalizeVec(src_filter.chrV, 1.0);
677         normalizeVec(src_filter.lumH, 1.0);
678         normalizeVec(src_filter.lumV, 1.0);
679
680         if(verbose > 1) printVec(src_filter.chrH);
681         if(verbose > 1) printVec(src_filter.lumH);
682
683         switch(sws_flags)
684         {
685                 case 0: flags|= SWS_FAST_BILINEAR; break;
686                 case 1: flags|= SWS_BILINEAR; break;
687                 case 2: flags|= SWS_BICUBIC; break;
688                 case 3: flags|= SWS_X; break;
689                 case 4: flags|= SWS_POINT; break;
690                 case 5: flags|= SWS_AREA; break;
691                 default:flags|= SWS_BILINEAR; break;
692         }
693
694         return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
695 }
696
697
698 static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
699                               int srcW, int dstW, int filterAlign, int one, int flags,
700                               SwsVector *srcFilter, SwsVector *dstFilter)
701 {
702         int i;
703         int filterSize;
704         int filter2Size;
705         int minFilterSize;
706         double *filter=NULL;
707         double *filter2=NULL;
708 #ifdef ARCH_X86
709         if(gCpuCaps.hasMMX)
710                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
711 #endif
712
713         // Note the +1 is for the MMXscaler which reads over the end
714         *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
715
716         if(ABS(xInc - 0x10000) <10) // unscaled
717         {
718                 int i;
719                 filterSize= 1;
720                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
721                 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
722
723                 for(i=0; i<dstW; i++)
724                 {
725                         filter[i*filterSize]=1;
726                         (*filterPos)[i]=i;
727                 }
728
729         }
730         else if(flags&SWS_POINT) // lame looking point sampling mode
731         {
732                 int i;
733                 int xDstInSrc;
734                 filterSize= 1;
735                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
736                 
737                 xDstInSrc= xInc/2 - 0x8000;
738                 for(i=0; i<dstW; i++)
739                 {
740                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
741
742                         (*filterPos)[i]= xx;
743                         filter[i]= 1.0;
744                         xDstInSrc+= xInc;
745                 }
746         }
747         else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
748         {
749                 int i;
750                 int xDstInSrc;
751                 if     (flags&SWS_BICUBIC) filterSize= 4;
752                 else if(flags&SWS_X      ) filterSize= 4;
753                 else                       filterSize= 2; // SWS_BILINEAR / SWS_AREA 
754 //              printf("%d %d %d\n", filterSize, srcW, dstW);
755                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
756
757                 xDstInSrc= xInc/2 - 0x8000;
758                 for(i=0; i<dstW; i++)
759                 {
760                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
761                         int j;
762
763                         (*filterPos)[i]= xx;
764                         if((flags & SWS_BICUBIC) || (flags & SWS_X))
765                         {
766                                 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
767                                 double y1,y2,y3,y4;
768                                 double A= -0.6;
769                                 if(flags & SWS_BICUBIC){
770                                                 // Equation is from VirtualDub
771                                         y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
772                                         y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
773                                         y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
774                                         y4 = (                  +           A*d*d -       A*d*d*d);
775                                 }else{
776                                                 // cubic interpolation (derived it myself)
777                                         y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
778                                         y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
779                                         y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
780                                         y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
781                                 }
782
783 //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
784                                 filter[i*filterSize + 0]= y1;
785                                 filter[i*filterSize + 1]= y2;
786                                 filter[i*filterSize + 2]= y3;
787                                 filter[i*filterSize + 3]= y4;
788 //                              printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
789                         }
790                         else
791                         {
792                                 //Bilinear upscale / linear interpolate / Area averaging
793                                 for(j=0; j<filterSize; j++)
794                                 {
795                                         double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
796                                         double coeff= 1.0 - d;
797                                         if(coeff<0) coeff=0;
798         //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
799                                         filter[i*filterSize + j]= coeff;
800                                         xx++;
801                                 }
802                         }
803                         xDstInSrc+= xInc;
804                 }
805         }
806         else // downscale
807         {
808                 int xDstInSrc;
809                 ASSERT(dstW <= srcW)
810
811                 if(flags&SWS_BICUBIC)   filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
812                 else if(flags&SWS_X)    filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
813                 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
814                 else /* BILINEAR */     filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
815 //              printf("%d %d %d\n", *filterSize, srcW, dstW);
816                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
817
818                 xDstInSrc= xInc/2 - 0x8000;
819                 for(i=0; i<dstW; i++)
820                 {
821                         int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
822                         int j;
823                         (*filterPos)[i]= xx;
824                         for(j=0; j<filterSize; j++)
825                         {
826                                 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
827                                 double coeff;
828                                 if((flags & SWS_BICUBIC) || (flags & SWS_X))
829                                 {
830                                         double A= -0.75;
831 //                                      d*=2;
832                                         // Equation is from VirtualDub
833                                         if(d<1.0)
834                                                 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
835                                         else if(d<2.0)
836                                                 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
837                                         else
838                                                 coeff=0.0;
839                                 }
840                                 else if(flags & SWS_AREA)
841                                 {
842                                         double srcPixelSize= (1<<16)/(double)xInc;
843                                         if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
844                                         else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
845                                         else coeff=0.0;
846                                 }
847                                 else
848                                 {
849                                         coeff= 1.0 - d;
850                                         if(coeff<0) coeff=0;
851                                 }
852 //                              printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
853                                 filter[i*filterSize + j]= coeff;
854                                 xx++;
855                         }
856                         xDstInSrc+= xInc;
857                 }
858         }
859
860         /* apply src & dst Filter to filter -> filter2
861            free(filter);
862         */
863         ASSERT(filterSize>0)
864         filter2Size= filterSize;
865         if(srcFilter) filter2Size+= srcFilter->length - 1;
866         if(dstFilter) filter2Size+= dstFilter->length - 1;
867         ASSERT(filter2Size>0)
868         filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
869
870         for(i=0; i<dstW; i++)
871         {
872                 int j;
873                 SwsVector scaleFilter;
874                 SwsVector *outVec;
875
876                 scaleFilter.coeff= filter + i*filterSize;
877                 scaleFilter.length= filterSize;
878
879                 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
880                 else          outVec= &scaleFilter;
881
882                 ASSERT(outVec->length == filter2Size)
883                 //FIXME dstFilter
884
885                 for(j=0; j<outVec->length; j++)
886                 {
887                         filter2[i*filter2Size + j]= outVec->coeff[j];
888                 }
889
890                 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
891
892                 if(outVec != &scaleFilter) freeVec(outVec);
893         }
894         free(filter); filter=NULL;
895
896         /* try to reduce the filter-size (step1 find size and shift left) */
897         // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
898         minFilterSize= 0;
899         for(i=dstW-1; i>=0; i--)
900         {
901                 int min= filter2Size;
902                 int j;
903                 double cutOff=0.0;
904
905                 /* get rid off near zero elements on the left by shifting left */
906                 for(j=0; j<filter2Size; j++)
907                 {
908                         int k;
909                         cutOff += ABS(filter2[i*filter2Size]);
910
911                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
912
913                         /* preserve Monotonicity because the core cant handle the filter otherwise */
914                         if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
915
916                         // Move filter coeffs left
917                         for(k=1; k<filter2Size; k++)
918                                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
919                         filter2[i*filter2Size + k - 1]= 0.0;
920                         (*filterPos)[i]++;
921                 }
922
923                 cutOff=0.0;
924                 /* count near zeros on the right */
925                 for(j=filter2Size-1; j>0; j--)
926                 {
927                         cutOff += ABS(filter2[i*filter2Size + j]);
928
929                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
930                         min--;
931                 }
932
933                 if(min>minFilterSize) minFilterSize= min;
934         }
935
936         ASSERT(minFilterSize > 0)
937         filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
938         ASSERT(filterSize > 0)
939         filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
940         *outFilterSize= filterSize;
941
942         if(flags&SWS_PRINT_INFO)
943                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
944         /* try to reduce the filter-size (step2 reduce it) */
945         for(i=0; i<dstW; i++)
946         {
947                 int j;
948
949                 for(j=0; j<filterSize; j++)
950                 {
951                         if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
952                         else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
953                 }
954         }
955         free(filter2); filter2=NULL;
956         
957
958         //FIXME try to align filterpos if possible
959
960         //fix borders
961         for(i=0; i<dstW; i++)
962         {
963                 int j;
964                 if((*filterPos)[i] < 0)
965                 {
966                         // Move filter coeffs left to compensate for filterPos
967                         for(j=1; j<filterSize; j++)
968                         {
969                                 int left= MAX(j + (*filterPos)[i], 0);
970                                 filter[i*filterSize + left] += filter[i*filterSize + j];
971                                 filter[i*filterSize + j]=0;
972                         }
973                         (*filterPos)[i]= 0;
974                 }
975
976                 if((*filterPos)[i] + filterSize > srcW)
977                 {
978                         int shift= (*filterPos)[i] + filterSize - srcW;
979                         // Move filter coeffs right to compensate for filterPos
980                         for(j=filterSize-2; j>=0; j--)
981                         {
982                                 int right= MIN(j + shift, filterSize-1);
983                                 filter[i*filterSize +right] += filter[i*filterSize +j];
984                                 filter[i*filterSize +j]=0;
985                         }
986                         (*filterPos)[i]= srcW - filterSize;
987                 }
988         }
989
990         // Note the +1 is for the MMXscaler which reads over the end
991         *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
992         memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
993
994         /* Normalize & Store in outFilter */
995         for(i=0; i<dstW; i++)
996         {
997                 int j;
998                 double sum=0;
999                 double scale= one;
1000                 for(j=0; j<filterSize; j++)
1001                 {
1002                         sum+= filter[i*filterSize + j];
1003                 }
1004                 scale/= sum;
1005                 for(j=0; j<filterSize; j++)
1006                 {
1007                         (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
1008                 }
1009         }
1010         
1011         (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1012         for(i=0; i<*outFilterSize; i++)
1013         {
1014                 int j= dstW*(*outFilterSize);
1015                 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1016         }
1017
1018         free(filter);
1019 }
1020
1021 #ifdef ARCH_X86
/*
 * Builds the run-time generated horizontal scaler ("funnyCode").
 *
 * Two machine-code fragments (A and B) are captured from the asm statements
 * below; they are never executed in place (each asm starts with "jmp 9f").
 * For every group of 4 output pixels one fragment is memcpy()'d into
 * funnyCode and the immediate bytes of its two pshufw instructions are
 * patched so that they select the source pixels needed for that group.
 *
 *  dstW       destination width in pixels
 *  xInc       horizontal step; used as 16.16 fixed point (xpos>>16 is the
 *             integer source position, xpos&0xFFFF the fraction)
 *  funnyCode  output buffer receiving the generated code; its capacity is
 *             not checked here
 *  filter     output: 4 weights per group, the inverted 16 bit fraction
 *             reduced to 7 bits
 *  filterPos  output: source start position of each group, stored at
 *             index i/2 for every i that is a multiple of 4
 *  numSplits  only dstW/numSplits output pixels are processed
 */
1022 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
1023 {
1024         uint8_t *fragmentA;
1025         int imm8OfPShufW1A;
1026         int imm8OfPShufW2A;
1027         int fragmentLengthA;
1028         uint8_t *fragmentB;
1029         int imm8OfPShufW1B;
1030         int imm8OfPShufW2B;
1031         int fragmentLengthB;
1032         int fragmentPos;
1033
1034         int xpos, i;
1035
1036         // create an optimized horizontal scaling routine
1037
1038         //code fragment
1039
        /* Fragment A: loads source bytes twice (offset 0 and offset 1) and
           shuffles each load separately.
           NOTE(review): the register roles (edx/ecx/ebx/edi bases, eax/esi
           indices) are set up by the code that calls funnyCode, which is not
           visible here - confirm against the caller. */
1040         asm volatile(
1041                 "jmp 9f                         \n\t"
1042         // Begin
1043                 "0:                             \n\t"
1044                 "movq (%%edx, %%eax), %%mm3     \n\t" 
1045                 "movd (%%ecx, %%esi), %%mm0     \n\t" 
1046                 "movd 1(%%ecx, %%esi), %%mm1    \n\t"
1047                 "punpcklbw %%mm7, %%mm1         \n\t"
1048                 "punpcklbw %%mm7, %%mm0         \n\t"
1049                 "pshufw $0xFF, %%mm1, %%mm1     \n\t"
1050                 "1:                             \n\t"
1051                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
1052                 "2:                             \n\t"
1053                 "psubw %%mm1, %%mm0             \n\t"
1054                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
1055                 "pmullw %%mm3, %%mm0            \n\t"
1056                 "psllw $7, %%mm1                \n\t"
1057                 "paddw %%mm1, %%mm0             \n\t"
1058
1059                 "movq %%mm0, (%%edi, %%eax)     \n\t"
1060
1061                 "addl $8, %%eax                 \n\t"
1062         // End
1063                 "9:                             \n\t"
1064 //              "int $3\n\t"
        /* %0 = address of the fragment (label 0)
           %1, %2 = offset, relative to label 0, of the byte just before
                    labels 1 and 2, i.e. the last byte of each pshufw
                    instruction (its imm8)
           %3 = fragment length in bytes (label 9 - label 0) */
1065                 "leal 0b, %0                    \n\t"
1066                 "leal 1b, %1                    \n\t"
1067                 "leal 2b, %2                    \n\t"
1068                 "decl %1                        \n\t"
1069                 "decl %2                        \n\t"
1070                 "subl %0, %1                    \n\t"
1071                 "subl %0, %2                    \n\t"
1072                 "leal 9b, %3                    \n\t"
1073                 "subl %0, %3                    \n\t"
1074
1075
1076                 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1077                 "=r" (fragmentLengthA)
1078         );
1079
        /* Fragment B: same as A but with a single source load; both pshufw
           instructions read from mm0. The trailing leal/decl/subl sequence
           extracts address, imm8 offsets and length exactly as for A. */
1080         asm volatile(
1081                 "jmp 9f                         \n\t"
1082         // Begin
1083                 "0:                             \n\t"
1084                 "movq (%%edx, %%eax), %%mm3     \n\t" 
1085                 "movd (%%ecx, %%esi), %%mm0     \n\t" 
1086                 "punpcklbw %%mm7, %%mm0         \n\t"
1087                 "pshufw $0xFF, %%mm0, %%mm1     \n\t"
1088                 "1:                             \n\t"
1089                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
1090                 "2:                             \n\t"
1091                 "psubw %%mm1, %%mm0             \n\t"
1092                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
1093                 "pmullw %%mm3, %%mm0            \n\t"
1094                 "psllw $7, %%mm1                \n\t"
1095                 "paddw %%mm1, %%mm0             \n\t"
1096
1097                 "movq %%mm0, (%%edi, %%eax)     \n\t"
1098
1099                 "addl $8, %%eax                 \n\t"
1100         // End
1101                 "9:                             \n\t"
1102 //              "int $3\n\t"
1103                 "leal 0b, %0                    \n\t"
1104                 "leal 1b, %1                    \n\t"
1105                 "leal 2b, %2                    \n\t"
1106                 "decl %1                        \n\t"
1107                 "decl %2                        \n\t"
1108                 "subl %0, %1                    \n\t"
1109                 "subl %0, %2                    \n\t"
1110                 "leal 9b, %3                    \n\t"
1111                 "subl %0, %3                    \n\t"
1112
1113
1114                 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1115                 "=r" (fragmentLengthB)
1116         );
1117
1118         xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1119         fragmentPos=0;
1120         
1121         for(i=0; i<dstW/numSplits; i++)
1122         {
1123                 int xx=xpos>>16;
1124
                // one code fragment is emitted per group of 4 output pixels
1125                 if((i&3) == 0)
1126                 {
                        // a..d: source offsets of the 4 output pixels relative to xx
1127                         int a=0;
1128                         int b=((xpos+xInc)>>16) - xx;
1129                         int c=((xpos+xInc*2)>>16) - xx;
1130                         int d=((xpos+xInc*3)>>16) - xx;
1131
                        // weight = inverted 16 bit fraction, reduced to 7 bits
1132                         filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
1133                         filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
1134                         filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
1135                         filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
1136                         filterPos[i/2]= xx;
1137
                        // the pshufw selectors are 2 bits wide (0..3): if the largest
                        // needed offset d+1 fits, the single-load fragment B is used
1138                         if(d+1<4)
1139                         {
1140                                 int maxShift= 3-(d+1);
1141                                 int shift=0;
1142
1143                                 memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
1144
                                // first shuffle selects offset+1, second the offset itself
1145                                 funnyCode[fragmentPos + imm8OfPShufW1B]=
1146                                         (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
1147                                 funnyCode[fragmentPos + imm8OfPShufW2B]=
1148                                         a | (b<<2) | (c<<4) | (d<<6);
1149
1150                                 if(i+3>=dstW) shift=maxShift; //avoid overread
1151                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
1152
                                // 0x55*shift adds shift to each of the four 2 bit selectors,
                                // compensating for the start position moved back by shift
1153                                 if(shift && i>=shift)
1154                                 {
1155                                         funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
1156                                         funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
1157                                         filterPos[i/2]-=shift;
1158                                 }
1159
1160                                 fragmentPos+= fragmentLengthB;
1161                         }
1162                         else
1163                         {
                                // fragment A: both shuffles use the same selectors, the
                                // +1 neighbour comes from the second load at offset 1
1164                                 int maxShift= 3-d;
1165                                 int shift=0;
1166
1167                                 memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
1168
1169                                 funnyCode[fragmentPos + imm8OfPShufW1A]=
1170                                 funnyCode[fragmentPos + imm8OfPShufW2A]=
1171                                         a | (b<<2) | (c<<4) | (d<<6);
1172
1173                                 if(i+4>=dstW) shift=maxShift; //avoid overread
1174                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
1175
1176                                 if(shift && i>=shift)
1177                                 {
1178                                         funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
1179                                         funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
1180                                         filterPos[i/2]-=shift;
1181                                 }
1182
1183                                 fragmentPos+= fragmentLengthA;
1184                         }
1185
                        // terminate the code generated so far; fragmentPos is not advanced,
                        // so the next fragment overwrites this RET
1186                         funnyCode[fragmentPos]= RET;
1187                 }
1188                 xpos+=xInc;
1189         }
1190         filterPos[i/2]= xpos>>16; // needed to jump to the next part
1191 }
1192 #endif // ARCH_X86
1193
1194 //FIXME remove
/* Intentionally empty; the function does nothing and is marked for removal. */
void SwScale_Init()
{
}
1197
1198 static void globalInit(){
1199     // generating tables:
1200     int i;
1201     for(i=0; i<768; i++){
1202         int c= MIN(MAX(i-256, 0), 255);
1203         clip_table[i]=c;
1204         yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1205         yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1206         yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1207         yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1208         yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1209     }
1210
1211     for(i=0; i<768; i++)
1212     {
1213         int v= clip_table[i];
1214         clip_table16b[i]=  v>>3;
1215         clip_table16g[i]= (v<<3)&0x07E0;
1216         clip_table16r[i]= (v<<8)&0xF800;
1217         clip_table15b[i]=  v>>3;
1218         clip_table15g[i]= (v<<2)&0x03E0;
1219         clip_table15r[i]= (v<<7)&0x7C00;
1220     }
1221
1222 cpuCaps= gCpuCaps;
1223
1224 #ifdef RUNTIME_CPUDETECT
1225 #ifdef CAN_COMPILE_X86_ASM
1226         // ordered per speed fasterst first
1227         if(gCpuCaps.hasMMX2)
1228                 swScale= swScale_MMX2;
1229         else if(gCpuCaps.has3DNow)
1230                 swScale= swScale_3DNow;
1231         else if(gCpuCaps.hasMMX)
1232                 swScale= swScale_MMX;
1233         else
1234                 swScale= swScale_C;
1235
1236 #else
1237         swScale= swScale_C;
1238         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1239 #endif
1240 #else //RUNTIME_CPUDETECT
1241 #ifdef HAVE_MMX2
1242         swScale= swScale_MMX2;
1243         cpuCaps.has3DNow = 0;
1244 #elif defined (HAVE_3DNOW)
1245         swScale= swScale_3DNow;
1246         cpuCaps.hasMMX2 = 0;
1247 #elif defined (HAVE_MMX)
1248         swScale= swScale_MMX;
1249         cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1250 #else
1251         swScale= swScale_C;
1252         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1253 #endif
1254 #endif //!RUNTIME_CPUDETECT
1255 }
1256
1257 /* Wrapper functions for yuv2bgr */
1258 static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1259              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1260         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1261
1262         if(c->srcFormat==IMGFMT_YV12)
1263                 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1264         else /* I420 & IYUV */
1265                 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1266 }
1267
1268 static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1269              int srcSliceH, uint8_t* dst[], int dstStride[]){
1270         
1271         if(dstStride[0]*3==srcStride[0]*4)
1272                 rgb24to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1273         else
1274         {
1275                 int i;
1276                 uint8_t *srcPtr= src[0];
1277                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1278
1279                 for(i=0; i<srcSliceH; i++)
1280                 {
1281                         rgb24to32(srcPtr, dstPtr, c->srcW*3);
1282                         srcPtr+= srcStride[0];
1283                         dstPtr+= dstStride[0];
1284                 }
1285         }     
1286 }
1287
1288 static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1289              int srcSliceH, uint8_t* dst[], int dstStride[]){
1290         
1291         if(dstStride[0]*4==srcStride[0]*3)
1292                 rgb32to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1293         else
1294         {
1295                 int i;
1296                 uint8_t *srcPtr= src[0];
1297                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1298
1299                 for(i=0; i<srcSliceH; i++)
1300                 {
1301                         rgb32to24(srcPtr, dstPtr, c->srcW<<2);
1302                         srcPtr+= srcStride[0];
1303                         dstPtr+= dstStride[0];
1304                 }
1305         }     
1306 }
1307
1308 static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1309              int srcSliceH, uint8_t* dst[], int dstStride[]){
1310         
1311         if(dstStride[0]==srcStride[0])
1312                 rgb15to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1313         else
1314         {
1315                 int i;
1316                 uint8_t *srcPtr= src[0];
1317                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1318
1319                 for(i=0; i<srcSliceH; i++)
1320                 {
1321                         rgb15to16(srcPtr, dstPtr, c->srcW<<1);
1322                         srcPtr+= srcStride[0];
1323                         dstPtr+= dstStride[0];
1324                 }
1325         }     
1326 }
1327
1328 static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1329              int srcSliceH, uint8_t* dst[], int dstStride[]){
1330
1331         rgb24toyv12(
1332                 src[0], 
1333                 dst[0]+ srcSliceY    *dstStride[0], 
1334                 dst[1]+(srcSliceY>>1)*dstStride[1], 
1335                 dst[2]+(srcSliceY>>1)*dstStride[2],
1336                 c->srcW, srcSliceH, 
1337                 dstStride[0], dstStride[1], srcStride[0]);
1338 }
1339
1340
1341 /* unscaled copy like stuff (assumes nearly identical formats) */
1342 static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
1343              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1344
1345         int srcStride[3];
1346         uint8_t *src[3];
1347         uint8_t *dst[3];
1348
1349         if(c->srcFormat == IMGFMT_I420){
1350                 src[0]= srcParam[0];
1351                 src[1]= srcParam[2];
1352                 src[2]= srcParam[1];
1353                 srcStride[0]= srcStrideParam[0];
1354                 srcStride[1]= srcStrideParam[2];
1355                 srcStride[2]= srcStrideParam[1];
1356         }
1357         else if(c->srcFormat==IMGFMT_YV12){
1358                 src[0]= srcParam[0];
1359                 src[1]= srcParam[1];
1360                 src[2]= srcParam[2];
1361                 srcStride[0]= srcStrideParam[0];
1362                 srcStride[1]= srcStrideParam[1];
1363                 srcStride[2]= srcStrideParam[2];
1364         }
1365         else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
1366                 src[0]= srcParam[0];
1367                 src[1]=
1368                 src[2]= NULL;
1369                 srcStride[0]= srcStrideParam[0];
1370                 srcStride[1]=
1371                 srcStride[2]= 0;
1372         }
1373
1374         if(c->dstFormat == IMGFMT_I420){
1375                 dst[0]= dstParam[0];
1376                 dst[1]= dstParam[2];
1377                 dst[2]= dstParam[1];
1378                 
1379         }else{
1380                 dst[0]= dstParam[0];
1381                 dst[1]= dstParam[1];
1382                 dst[2]= dstParam[2];
1383         }
1384
1385         if(isPacked(c->srcFormat))
1386         {
1387                 if(dstStride[0]==srcStride[0])
1388                         memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1389                 else
1390                 {
1391                         int i;
1392                         uint8_t *srcPtr= src[0];
1393                         uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1394                         int length=0;
1395
1396                         /* universal length finder */
1397                         while(length+c->srcW <= ABS(dstStride[0]) 
1398                            && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
1399                         ASSERT(length!=0);
1400
1401                         for(i=0; i<srcSliceH; i++)
1402                         {
1403                                 memcpy(dstPtr, srcPtr, length);
1404                                 srcPtr+= srcStride[0];
1405                                 dstPtr+= dstStride[0];
1406                         }
1407                 }
1408         }
1409         else 
1410         { /* Planar YUV */
1411                 int plane;
1412                 for(plane=0; plane<3; plane++)
1413                 {
1414                         int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
1415                         int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
1416                         int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
1417
1418                         if(dstStride[plane]==srcStride[plane])
1419                                 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1420                         else
1421                         {
1422                                 int i;
1423                                 uint8_t *srcPtr= src[plane];
1424                                 uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1425                                 for(i=0; i<height; i++)
1426                                 {
1427                                         memcpy(dstPtr, srcPtr, length);
1428                                         srcPtr+= srcStride[plane];
1429                                         dstPtr+= dstStride[plane];
1430                                 }
1431                         }
1432                 }
1433         }
1434 }
1435
1436 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1437                          SwsFilter *srcFilter, SwsFilter *dstFilter){
1438
1439         SwsContext *c;
1440         int i;
1441         int usesFilter;
1442         SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1443
1444 #ifdef ARCH_X86
1445         if(gCpuCaps.hasMMX)
1446                 asm volatile("emms\n\t"::: "memory");
1447 #endif
1448
1449         if(swScale==NULL) globalInit();
1450
1451         /* avoid dupplicate Formats, so we dont need to check to much */
1452         if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1453         if(srcFormat==IMGFMT_Y8)   srcFormat=IMGFMT_Y800;
1454         if(dstFormat==IMGFMT_Y8)   dstFormat=IMGFMT_Y800;
1455
1456         if(!isSupportedIn(srcFormat)) 
1457         {
1458                 mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1459                 return NULL;
1460         }
1461         if(!isSupportedOut(dstFormat))
1462         {
1463                  mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1464                 return NULL;
1465         }
1466
1467         /* sanity check */
1468         if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1469         {
1470                  mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %dx%d -> %dx%d is invalid scaling dimension\n", 
1471                         srcW, srcH, dstW, dstH);
1472                 return NULL;
1473         }
1474
1475         if(!dstFilter) dstFilter= &dummyFilter;
1476         if(!srcFilter) srcFilter= &dummyFilter;
1477
1478         c= memalign(64, sizeof(SwsContext));
1479         memset(c, 0, sizeof(SwsContext));
1480
1481         c->srcW= srcW;
1482         c->srcH= srcH;
1483         c->dstW= dstW;
1484         c->dstH= dstH;
1485         c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1486         c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1487         c->flags= flags;
1488         c->dstFormat= dstFormat;
1489         c->srcFormat= srcFormat;
1490
1491         usesFilter=0;
1492         if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1493         if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1494         if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1495         if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1496         if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1497         if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1498         if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1499         if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1500         
1501         /* unscaled special Cases */
1502         if(srcW==dstW && srcH==dstH && !usesFilter)
1503         {
1504                 /* yuv2bgr */
1505                 if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1506                 {
1507                         // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1508 #ifdef WORDS_BIGENDIAN
1509                         if(dstFormat==IMGFMT_BGR32)
1510                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1511                         else
1512                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1513 #else
1514                         yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1515 #endif
1516                         c->swScale= planarYuvToBgr;
1517
1518                         if(flags&SWS_PRINT_INFO)
1519                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
1520                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1521                         return c;
1522                 }
1523
1524                 /* simple copy */
1525                 if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1526                 {
1527                         c->swScale= simpleCopy;
1528
1529                         if(flags&SWS_PRINT_INFO)
1530                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
1531                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1532                         return c;
1533                 }
1534                 
1535                 /* bgr32to24 & rgb32to24*/
1536                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1537                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1538                 {
1539                         c->swScale= bgr32to24Wrapper;
1540
1541                         if(flags&SWS_PRINT_INFO)
1542                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
1543                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1544                         return c;
1545                 }
1546                 
1547                 /* bgr24to32 & rgb24to32*/
1548                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1549                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1550                 {
1551                         c->swScale= bgr24to32Wrapper;
1552
1553                         if(flags&SWS_PRINT_INFO)
1554                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
1555                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1556                         return c;
1557                 }
1558
1559                 /* bgr15to16 */
1560                 if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1561                 {
1562                         c->swScale= bgr15to16Wrapper;
1563
1564                         if(flags&SWS_PRINT_INFO)
1565                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
1566                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1567                         return c;
1568                 }
1569
1570                 /* bgr24toYV12 */
1571                 if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
1572                 {
1573                         c->swScale= bgr24toyv12Wrapper;
1574
1575                         if(flags&SWS_PRINT_INFO)
1576                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n", 
1577                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1578                         return c;
1579                 }
1580         }
1581
1582         if(cpuCaps.hasMMX2)
1583         {
1584                 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1585                 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1586                 {
1587                         if(flags&SWS_PRINT_INFO)
1588                                 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1589                 }
1590         }
1591         else
1592                 c->canMMX2BeUsed=0;
1593
1594
1595         /* dont use full vertical UV input/internaly if the source doesnt even have it */
1596         if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1597         /* dont use full horizontal UV input if the source doesnt even have it */
1598         if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1599         /* dont use full horizontal UV internally if the destination doesnt even have it */
1600         if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1601
1602         if(flags&SWS_FULL_CHR_H_INP)    c->chrSrcW= srcW;
1603         else                            c->chrSrcW= (srcW+1)>>1;
1604
1605         if(flags&SWS_FULL_CHR_H_INT)    c->chrDstW= dstW;
1606         else                            c->chrDstW= (dstW+1)>>1;
1607
1608         if(flags&SWS_FULL_CHR_V)        c->chrSrcH= srcH;
1609         else                            c->chrSrcH= (srcH+1)>>1;
1610
1611         if(isHalfChrV(dstFormat))       c->chrDstH= (dstH+1)>>1;
1612         else                            c->chrDstH= dstH;
1613
1614         c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1615         c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1616
1617
1618         // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1619         // but only for the FAST_BILINEAR mode otherwise do correct scaling
1620         // n-2 is the last chrominance sample available
1621         // this is not perfect, but noone shuld notice the difference, the more correct variant
1622         // would be like the vertical one, but that would require some special code for the
1623         // first and last pixel
1624         if(flags&SWS_FAST_BILINEAR)
1625         {
1626                 if(c->canMMX2BeUsed)
1627                 {
1628                         c->lumXInc+= 20;
1629                         c->chrXInc+= 20;
1630                 }
1631                 //we dont use the x86asm scaler if mmx is available
1632                 else if(cpuCaps.hasMMX)
1633                 {
1634                         c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1635                         c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1636                 }
1637         }
1638
1639         /* precalculate horizontal scaler filter coefficients */
1640         {
1641                 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1642
1643                 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1644                                  srcW      ,       dstW, filterAlign, 1<<14, flags,
1645                                  srcFilter->lumH, dstFilter->lumH);
1646                 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1647                                 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1648                                  srcFilter->chrH, dstFilter->chrH);
1649
1650 #ifdef ARCH_X86
1651 // cant downscale !!!
1652                 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1653                 {
1654                         c->lumMmx2Filter   = (int16_t*)memalign(8, (dstW        /8+8)*sizeof(int16_t));
1655                         c->chrMmx2Filter   = (int16_t*)memalign(8, (c->chrDstW  /4+8)*sizeof(int16_t));
1656                         c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW      /2/8+8)*sizeof(int32_t));
1657                         c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
1658
1659                         initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
1660                         initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
1661                 }
1662 #endif
1663         } // Init Horizontal stuff
1664
1665
1666
1667         /* precalculate vertical scaler filter coefficients */
1668         initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1669                         srcH      ,        dstH, 1, (1<<12)-4, flags,
1670                         srcFilter->lumV, dstFilter->lumV);
1671         initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1672                         (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1673                          srcFilter->chrV, dstFilter->chrV);
1674
1675         // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1676         c->vLumBufSize= c->vLumFilterSize;
1677         c->vChrBufSize= c->vChrFilterSize;
1678         for(i=0; i<dstH; i++)
1679         {
1680                 int chrI= i*c->chrDstH / dstH;
1681                 int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1682                                  ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1683                 nextSlice&= ~1; // Slices start at even boundaries
1684                 if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1685                         c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1686                 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1687                         c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1688         }
1689
1690         // allocate pixbufs (we use dynamic allocation because otherwise we would need to
1691         c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1692         c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1693         //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
1694         for(i=0; i<c->vLumBufSize; i++)
1695                 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1696         for(i=0; i<c->vChrBufSize; i++)
1697                 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1698
1699         //try to avoid drawing green stuff between the right end and the stride end
1700         for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1701         for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1702
1703         ASSERT(c->chrDstH <= dstH)
1704
1705         // pack filter data for mmx code
1706         if(cpuCaps.hasMMX)
1707         {
1708                 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1709                 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1710                 for(i=0; i<c->vLumFilterSize*dstH; i++)
1711                         c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1712                                 c->vLumFilter[i];
1713                 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1714                         c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1715                                 c->vChrFilter[i];
1716         }
1717
1718         if(flags&SWS_PRINT_INFO)
1719         {
1720 #ifdef DITHER1XBPP
1721                 char *dither= " dithered";
1722 #else
1723                 char *dither= "";
1724 #endif
1725                 if(flags&SWS_FAST_BILINEAR)
1726                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: FAST_BILINEAR scaler, ");
1727                 else if(flags&SWS_BILINEAR)
1728                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: BILINEAR scaler, ");
1729                 else if(flags&SWS_BICUBIC)
1730                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: BICUBIC scaler, ");
1731                 else if(flags&SWS_X)
1732                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: Experimental scaler, ");
1733                 else if(flags&SWS_POINT)
1734                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: Nearest Neighbor / POINT scaler, ");
1735                 else if(flags&SWS_AREA)
1736                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: Area Averageing scaler, ");
1737                 else
1738                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: ehh flags invalid?! ");
1739
1740                 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1741                         mp_msg(MSGT_SWS,MSGL_INFO,"from %s to%s %s ", 
1742                                 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
1743                 else
1744                         mp_msg(MSGT_SWS,MSGL_INFO,"from %s to %s ", 
1745                                 vo_format_name(srcFormat), vo_format_name(dstFormat));
1746
1747                 if(cpuCaps.hasMMX2)
1748                         mp_msg(MSGT_SWS,MSGL_INFO,"using MMX2\n");
1749                 else if(cpuCaps.has3DNow)
1750                         mp_msg(MSGT_SWS,MSGL_INFO,"using 3DNOW\n");
1751                 else if(cpuCaps.hasMMX)
1752                         mp_msg(MSGT_SWS,MSGL_INFO,"using MMX\n");
1753                 else
1754                         mp_msg(MSGT_SWS,MSGL_INFO,"using C\n");
1755         }
1756
1757         if((flags & SWS_PRINT_INFO) && verbose)
1758         {
1759                 if(cpuCaps.hasMMX)
1760                 {
1761                         if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1762                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1763                         else
1764                         {
1765                                 if(c->hLumFilterSize==4)
1766                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1767                                 else if(c->hLumFilterSize==8)
1768                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1769                                 else
1770                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1771
1772                                 if(c->hChrFilterSize==4)
1773                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1774                                 else if(c->hChrFilterSize==8)
1775                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1776                                 else
1777                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1778                         }
1779                 }
1780                 else
1781                 {
1782 #ifdef ARCH_X86
1783                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using X86-Asm scaler for horizontal scaling\n");
1784 #else
1785                         if(flags & SWS_FAST_BILINEAR)
1786                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1787                         else
1788                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using C scaler for horizontal scaling\n");
1789 #endif
1790                 }
1791                 if(isPlanarYUV(dstFormat))
1792                 {
1793                         if(c->vLumFilterSize==1)
1794                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1795                         else
1796                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1797                 }
1798                 else
1799                 {
1800                         if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1801                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1802                                        "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1803                         else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1804                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1805                         else
1806                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1807                 }
1808
1809                 if(dstFormat==IMGFMT_BGR24)
1810                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR24 Converter\n",
1811                                 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1812                 else if(dstFormat==IMGFMT_BGR32)
1813                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1814                 else if(dstFormat==IMGFMT_BGR16)
1815                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1816                 else if(dstFormat==IMGFMT_BGR15)
1817                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1818
1819                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1820         }
1821         if((flags & SWS_PRINT_INFO) && verbose>1)
1822         {
1823                 mp_msg(MSGT_SWS,MSGL_DBG2,"SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1824                         c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1825                 mp_msg(MSGT_SWS,MSGL_DBG2,"SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1826                         c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1827         }
1828
1829         c->swScale= swScale;
1830         return c;
1831 }
1832
1833 /**
1834  * returns a normalized gaussian curve used to filter stuff
1835  * quality=3 is high quality, lowwer is lowwer quality
1836  */
1837
1838 SwsVector *getGaussianVec(double variance, double quality){
1839         const int length= (int)(variance*quality + 0.5) | 1;
1840         int i;
1841         double *coeff= memalign(sizeof(double), length*sizeof(double));
1842         double middle= (length-1)*0.5;
1843         SwsVector *vec= malloc(sizeof(SwsVector));
1844
1845         vec->coeff= coeff;
1846         vec->length= length;
1847
1848         for(i=0; i<length; i++)
1849         {
1850                 double dist= i-middle;
1851                 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1852         }
1853
1854         normalizeVec(vec, 1.0);
1855
1856         return vec;
1857 }
1858
1859 SwsVector *getConstVec(double c, int length){
1860         int i;
1861         double *coeff= memalign(sizeof(double), length*sizeof(double));
1862         SwsVector *vec= malloc(sizeof(SwsVector));
1863
1864         vec->coeff= coeff;
1865         vec->length= length;
1866
1867         for(i=0; i<length; i++)
1868                 coeff[i]= c;
1869
1870         return vec;
1871 }
1872
1873
1874 SwsVector *getIdentityVec(void){
1875         double *coeff= memalign(sizeof(double), sizeof(double));
1876         SwsVector *vec= malloc(sizeof(SwsVector));
1877         coeff[0]= 1.0;
1878
1879         vec->coeff= coeff;
1880         vec->length= 1;
1881
1882         return vec;
1883 }
1884
1885 void normalizeVec(SwsVector *a, double height){
1886         int i;
1887         double sum=0;
1888         double inv;
1889
1890         for(i=0; i<a->length; i++)
1891                 sum+= a->coeff[i];
1892
1893         inv= height/sum;
1894
1895         for(i=0; i<a->length; i++)
1896                 a->coeff[i]*= height;
1897 }
1898
1899 void scaleVec(SwsVector *a, double scalar){
1900         int i;
1901
1902         for(i=0; i<a->length; i++)
1903                 a->coeff[i]*= scalar;
1904 }
1905
1906 static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
1907         int length= a->length + b->length - 1;
1908         double *coeff= memalign(sizeof(double), length*sizeof(double));
1909         int i, j;
1910         SwsVector *vec= malloc(sizeof(SwsVector));
1911
1912         vec->coeff= coeff;
1913         vec->length= length;
1914
1915         for(i=0; i<length; i++) coeff[i]= 0.0;
1916
1917         for(i=0; i<a->length; i++)
1918         {
1919                 for(j=0; j<b->length; j++)
1920                 {
1921                         coeff[i+j]+= a->coeff[i]*b->coeff[j];
1922                 }
1923         }
1924
1925         return vec;
1926 }
1927
1928 static SwsVector *sumVec(SwsVector *a, SwsVector *b){
1929         int length= MAX(a->length, b->length);
1930         double *coeff= memalign(sizeof(double), length*sizeof(double));
1931         int i;
1932         SwsVector *vec= malloc(sizeof(SwsVector));
1933
1934         vec->coeff= coeff;
1935         vec->length= length;
1936
1937         for(i=0; i<length; i++) coeff[i]= 0.0;
1938
1939         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1940         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
1941
1942         return vec;
1943 }
1944
1945 static SwsVector *diffVec(SwsVector *a, SwsVector *b){
1946         int length= MAX(a->length, b->length);
1947         double *coeff= memalign(sizeof(double), length*sizeof(double));
1948         int i;
1949         SwsVector *vec= malloc(sizeof(SwsVector));
1950
1951         vec->coeff= coeff;
1952         vec->length= length;
1953
1954         for(i=0; i<length; i++) coeff[i]= 0.0;
1955
1956         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1957         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
1958
1959         return vec;
1960 }
1961
1962 /* shift left / or right if "shift" is negative */
1963 static SwsVector *getShiftedVec(SwsVector *a, int shift){
1964         int length= a->length + ABS(shift)*2;
1965         double *coeff= memalign(sizeof(double), length*sizeof(double));
1966         int i;
1967         SwsVector *vec= malloc(sizeof(SwsVector));
1968
1969         vec->coeff= coeff;
1970         vec->length= length;
1971
1972         for(i=0; i<length; i++) coeff[i]= 0.0;
1973
1974         for(i=0; i<a->length; i++)
1975         {
1976                 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1977         }
1978
1979         return vec;
1980 }
1981
1982 void shiftVec(SwsVector *a, int shift){
1983         SwsVector *shifted= getShiftedVec(a, shift);
1984         free(a->coeff);
1985         a->coeff= shifted->coeff;
1986         a->length= shifted->length;
1987         free(shifted);
1988 }
1989
1990 void addVec(SwsVector *a, SwsVector *b){
1991         SwsVector *sum= sumVec(a, b);
1992         free(a->coeff);
1993         a->coeff= sum->coeff;
1994         a->length= sum->length;
1995         free(sum);
1996 }
1997
1998 void subVec(SwsVector *a, SwsVector *b){
1999         SwsVector *diff= diffVec(a, b);
2000         free(a->coeff);
2001         a->coeff= diff->coeff;
2002         a->length= diff->length;
2003         free(diff);
2004 }
2005
2006 void convVec(SwsVector *a, SwsVector *b){
2007         SwsVector *conv= getConvVec(a, b);
2008         free(a->coeff);
2009         a->coeff= conv->coeff;
2010         a->length= conv->length;
2011         free(conv);
2012 }
2013
2014 SwsVector *cloneVec(SwsVector *a){
2015         double *coeff= memalign(sizeof(double), a->length*sizeof(double));
2016         int i;
2017         SwsVector *vec= malloc(sizeof(SwsVector));
2018
2019         vec->coeff= coeff;
2020         vec->length= a->length;
2021
2022         for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
2023
2024         return vec;
2025 }
2026
2027 void printVec(SwsVector *a){
2028         int i;
2029         double max=0;
2030         double min=0;
2031         double range;
2032
2033         for(i=0; i<a->length; i++)
2034                 if(a->coeff[i]>max) max= a->coeff[i];
2035
2036         for(i=0; i<a->length; i++)
2037                 if(a->coeff[i]<min) min= a->coeff[i];
2038
2039         range= max - min;
2040
2041         for(i=0; i<a->length; i++)
2042         {
2043                 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
2044                 printf("%1.3f ", a->coeff[i]);
2045                 for(;x>0; x--) printf(" ");
2046                 printf("|\n");
2047         }
2048 }
2049
2050 void freeVec(SwsVector *a){
2051         if(!a) return;
2052         if(a->coeff) free(a->coeff);
2053         a->coeff=NULL;
2054         a->length=0;
2055         free(a);
2056 }
2057
2058 void freeSwsContext(SwsContext *c){
2059         int i;
2060
2061         if(!c) return;
2062
2063         if(c->lumPixBuf)
2064         {
2065                 for(i=0; i<c->vLumBufSize; i++)
2066                 {
2067                         if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
2068                         c->lumPixBuf[i]=NULL;
2069                 }
2070                 free(c->lumPixBuf);
2071                 c->lumPixBuf=NULL;
2072         }
2073
2074         if(c->chrPixBuf)
2075         {
2076                 for(i=0; i<c->vChrBufSize; i++)
2077                 {
2078                         if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
2079                         c->chrPixBuf[i]=NULL;
2080                 }
2081                 free(c->chrPixBuf);
2082                 c->chrPixBuf=NULL;
2083         }
2084
2085         if(c->vLumFilter) free(c->vLumFilter);
2086         c->vLumFilter = NULL;
2087         if(c->vChrFilter) free(c->vChrFilter);
2088         c->vChrFilter = NULL;
2089         if(c->hLumFilter) free(c->hLumFilter);
2090         c->hLumFilter = NULL;
2091         if(c->hChrFilter) free(c->hChrFilter);
2092         c->hChrFilter = NULL;
2093
2094         if(c->vLumFilterPos) free(c->vLumFilterPos);
2095         c->vLumFilterPos = NULL;
2096         if(c->vChrFilterPos) free(c->vChrFilterPos);
2097         c->vChrFilterPos = NULL;
2098         if(c->hLumFilterPos) free(c->hLumFilterPos);
2099         c->hLumFilterPos = NULL;
2100         if(c->hChrFilterPos) free(c->hChrFilterPos);
2101         c->hChrFilterPos = NULL;
2102
2103         if(c->lumMmxFilter) free(c->lumMmxFilter);
2104         c->lumMmxFilter = NULL;
2105         if(c->chrMmxFilter) free(c->chrMmxFilter);
2106         c->chrMmxFilter = NULL;
2107
2108         if(c->lumMmx2Filter) free(c->lumMmx2Filter);
2109         c->lumMmx2Filter=NULL;
2110         if(c->chrMmx2Filter) free(c->chrMmx2Filter);
2111         c->chrMmx2Filter=NULL;
2112         if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
2113         c->lumMmx2FilterPos=NULL;
2114         if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
2115         c->chrMmx2FilterPos=NULL;
2116
2117         free(c);
2118 }
2119
2120