git.sesse.net Git - ffmpeg/blob - postproc/swscale.c

   1 /*
   2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17 */
  18
  19 /*
  20   supported Input formats: YV12, I420, IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8, Y800
  21   supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
  22   BGR15/16 support dithering
  23
  24   unscaled special converters
  25   YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
  26   YV12/I420/IYUV -> YV12/I420/IYUV
  27   YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
  28   BGR24 -> BGR32 & RGB24 -> RGB32
  29   BGR32 -> BGR24 & RGB32 -> RGB24
  30   BGR15 -> BGR16
  31 */
  32
  33 /*
  34 tested special converters
  35  YV12/I420 -> BGR16
  36  YV12 -> YV12
  37  BGR15 -> BGR16
  38  BGR16 -> BGR16
  39
  40 untested special converters
  41   YV12/I420 -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be ok)
  42   YV12/I420 -> YV12/I420
  43   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  44   BGR24 -> BGR32 & RGB24 -> RGB32
  45   BGR32 -> BGR24 & RGB32 -> RGB24
  46   BGR24 -> YV12
  47 */
  48
  49 #include <inttypes.h>
  50 #include <string.h>
  51 #include <math.h>
  52 #include <stdio.h>
  53 #include "../config.h"
  54 #include "../mangle.h"
  55 #include <assert.h>
  56 #ifdef HAVE_MALLOC_H
  57 #include <malloc.h>
  58 #else
  59 #include <stdlib.h>
  60 #endif
  61 #include "swscale.h"
  62 #include "../cpudetect.h"
  63 #include "../bswap.h"
  64 #include "../libvo/img_format.h"
  65 #include "rgb2rgb.h"
  66 #include "../libvo/fastmemcpy.h"
  67 #include "../mp_msg.h"
     // Compile-time switches for this file.
     // MOVNTQ / PAVGB are undefined here so that nothing inherited from the headers above leaks in.
     // NOTE(review): presumably they are (re)defined per CPU variant inside swscale_template.c - confirm.
  68 #undef MOVNTQ
  69 #undef PAVGB
  70
     // Debugging aids: uncomment to force a particular CPU feature set / endianness.
  71 //#undef HAVE_MMX2
  72 //#define HAVE_3DNOW
  73 //#undef HAVE_MMX
  74 //#undef ARCH_X86
  75 //#define WORDS_BIGENDIAN
     // Selects the dithered variants of the BGR15/BGR16 writers (tested with #ifdef in yuv2rgbXinC).
  76 #define DITHER1XBPP
  77
     // Selects the short set of bgr2{Y,U,V}Coeff constants defined further down in this file.
  78 #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
  79
  80 #define RET 0xC3 //near return opcode for X86
  81
     // NOTE: both expansions already end in ';', so "ASSERT(cond)" is a complete statement by itself.
     // Writing "if(c) ASSERT(x); else ..." therefore does not compile; brace such uses.
  82 #ifdef MP_DEBUG
  83 #define ASSERT(x) assert(x);
  84 #else
  85 #define ASSERT(x) ;
  86 #endif
  87
     // PI falls back to a literal when <math.h> does not provide M_PI.
  88 #ifdef M_PI
  89 #define PI M_PI
  90 #else
  91 #define PI 3.14159265358979323846
  92 #endif
  93
     // Pixel-format predicates. Each takes an IMGFMT_* code and evaluates to non-zero on a match.
     // They are plain macros that expand x several times, so x must be free of side effects.
     // isHalfChrV tests exactly the same two formats as isPlanarYUV; isHalfChrH additionally accepts YUY2.
     // NOTE(review): "HalfChrV/H" presumably means chroma is stored at half vertical/horizontal resolution - confirm.
  94 //FIXME replace this with something faster
  95 #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
  96 #define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
  97 #define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
  98 #define isHalfChrH(x)  ((x)==IMGFMT_YUY2 || (x)==IMGFMT_YV12 || (x)==IMGFMT_I420)
  99 #define isPacked(x)    ((x)==IMGFMT_YUY2 || ((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR || ((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
 100 #define isGray(x)      ((x)==IMGFMT_Y800)
     // Formats accepted as scaler input / output. Note that Y800 is accepted as input only.
 101 #define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
 102                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
 103                         || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
 104                         || (x)==IMGFMT_Y800)
 105 #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
 106                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
 107 #define isBGR(x)       ((x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15)
 108
     // RGB -> YUV weights as fixed-point integers with RGB2YUV_SHIFT fractional bits.
     // NOTE(review): names read as <source channel><target component> (e.g. BY = weight of B in Y) and the
     // values look like the studio-range BT.601 matrix - confirm against the code that uses them.
     // NOTE(review): "+0.5" followed by the (int) cast rounds to nearest only for positive values; the
     // negative weights are truncated toward zero after the +0.5, i.e. they end up one step less negative.
 109 #define RGB2YUV_SHIFT 16
 110 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
 111 #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
 112 #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
 113 #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
 114 #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
 115 #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
 116 #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
 117 #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
 118 #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
 119
     // Verbosity level owned by the player; getSwsContextFromCmdLine() prints extra info when it is > 1.
 120 extern int verbose; // defined in mplayer.c
 121 /*
 122 NOTES
 123 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 124
 125 TODO
 126 more intelligent misalignment avoidance for the horizontal scaler
 127 write special vertical cubic upscale version
 128 Optimize C code (yv12 / minmax)
 129 add support for packed pixel yuv input & output
 130 add support for Y8 output
 131 optimize bgr24 & bgr32
 132 add BGR4 output support
 133 write special BGR->BGR scaler
 134 deglobalize yuv2rgb*.c
 135 */
 136
     // Generic helpers. Each argument is expanded more than once, so never pass an
     // expression with side effects (i++, function calls that mutate state, ...).
 137 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
 138 #define MIN(a,b) ((a) > (b) ? (b) : (a))
 139 #define MAX(a,b) ((a) < (b) ? (b) : (a))
 140
     // All x86 inline-asm data and code paths in this file are guarded by CAN_COMPILE_X86_ASM.
 141 #ifdef ARCH_X86
 142 #define CAN_COMPILE_X86_ASM
 143 #endif
 144
     // 64-bit, 8-byte aligned constants that exist only in x86 builds.
     // NOTE(review): presumably referenced by name from the inline asm in swscale_template.c; nothing in
     // this part of the file reads them except in_asm_used_var_warning_killer() - confirm.
 145 #ifdef CAN_COMPILE_X86_ASM
     // YUV->RGB multipliers, one 16-bit value repeated in all four lanes. They match the hex values in the
     // yuvtab_* table names: 0x2568, 0x3343, 0x40cf, and 0xE5E2 / 0xF36E = -0x1a1e / -0x0c92 as signed 16 bit.
 146 static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
 147 static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
 148 static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
 149 static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
 150 static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
     // Byte masks (bF8/bFC), repeated 16-bit words (w400/w80/w10/w02) and byte-select masks (bm........).
 151 static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
 152 static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
 153 static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
 154 static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
 155 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
 156 static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
 157 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
 158 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
 159 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
 160 static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
 161
     // Current dither values; volatile and without initializer.
     // NOTE(review): presumably written per line from dither4/dither8 by code outside this view - confirm.
 162 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
 163 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
 164 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
 165 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
 166
     // Two alternative dither patterns each (index 0 / 1).
 167 static uint64_t __attribute__((aligned(8))) dither4[2]={
 168         0x0103010301030103LL,
 169         0x0200020002000200LL,};
 170
 171 static uint64_t __attribute__((aligned(8))) dither8[2]={
 172         0x0602060206020602LL,
 173         0x0004000400040004LL,};
 174
     // Per-channel bit masks for four packed 16-bit pixels:
     // 16 bpp = 5 bits blue (0x001F), 6 bits green (0x07E0), 5 bits red (0xF800);
     // 15 bpp = 5 bits each (0x001F, 0x03E0, 0x7C00).
 175 static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
 176 static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
 177 static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
 178 static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
 179 static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
 180 static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
 181
     // Masks that each select every third byte of a quadword, at three different phases.
 182 static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
 183 static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
 184 static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
 185
     // BGR -> YV12 weights, four 16-bit lanes per constant; FAST_BGR2YV12 selects the low-precision set.
 186 #ifdef FAST_BGR2YV12
 187 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
 188 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
 189 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
 190 #else
 191 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
 192 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
 193 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
 194 #endif
     // Per-byte offsets 0x10 (luma) and 0x80 (chroma), and a word-wise vector of ones.
 195 static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
 196 static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
 197 static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
 198
     // Scratch globals slated for removal by the original author (see the FIXME).
 199 // FIXME remove
 200 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
 201 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
 202 #endif
 203
     // Lookup tables used by the plain C output code. None of them is filled in this part of the file.
     // NOTE(review): the 768-entry tables are indexed with "value + 256" in yuv2rgbXinC, so presumably
     // entry i holds the result for value i-256 clamped to 0..255 - confirm against the init code.
 204 // clipping helper table for C implementations:
 205 static unsigned char clip_table[768];
 206
     // Per-channel tables for 16 bpp and 15 bpp output; yuv2rgbXinC ORs one b, one g and one r entry per pixel.
 207 static unsigned short clip_table16b[768];
 208 static unsigned short clip_table16g[768];
 209 static unsigned short clip_table16r[768];
 210 static unsigned short clip_table15b[768];
 211 static unsigned short clip_table15g[768];
 212 static unsigned short clip_table15r[768];
 213
     // These five tables have only 256 entries: valid indices are 0..255, without the +256 bias.
 214 // yuv->rgb conversion tables:
 215 static    int yuvtab_2568[256];
 216 static    int yuvtab_3343[256];
 217 static    int yuvtab_0c92[256];
 218 static    int yuvtab_1a1e[256];
 219 static    int yuvtab_40cf[256];
     // 768-entry counterparts of the tables above, for inputs that may leave the 0..255 range.
 220 // Needed for cubic scaler to catch overflows
 221 static    int clip_yuvtab_2568[768];
 222 static    int clip_yuvtab_3343[768];
 223 static    int clip_yuvtab_0c92[768];
 224 static    int clip_yuvtab_1a1e[768];
 225 static    int clip_yuvtab_40cf[768];
 226
     // Scaler selection used by getSwsContextFromCmdLine(): 0 fast bilinear, 1 bilinear, 2 bicubic (default),
     // 3 "X", 4 point, 5 area; any other value falls back to bilinear.
 227 //global sws_flags from the command line
 228 int sws_flags=2;
 229
     // Source filter handed to getSwsContext(); its four vectors are freed and rebuilt on every
     // getSwsContextFromCmdLine() call.
 230 //global srcFilter
 231 SwsFilter src_filter= {NULL, NULL, NULL, NULL};
 232
     // Command-line tunables read by getSwsContextFromCmdLine(); 0 means "off" for each of them.
 233 float sws_lum_gblur= 0.0;
 234 float sws_chr_gblur= 0.0;
 235 int sws_chr_vshift= 0;
 236 int sws_chr_hshift= 0;
 237 float sws_chr_sharpen= 0.0;
 238 float sws_lum_sharpen= 0.0;
 239
 240 /* cpuCaps combined from cpudetect and what's actually compiled in
 241    (if there is no support for something compiled in it won't appear here) */
 242 static CpuCaps cpuCaps;
 243
     // Global scaling entry point; starts out NULL and is not assigned in this part of the file.
     // Note that SwScale_YV12slice() calls context->swScale, not this pointer.
 244 void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
 245              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
 246
     // Forward declaration; the definition lies outside this part of the file.
 247 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
 248
 249 #ifdef CAN_COMPILE_X86_ASM
 250 void in_asm_used_var_warning_killer()
 251 {
 252  volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
 253  bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
 254  M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
 255  if(i) i=0;
 256 }
 257 #endif
 258
 259 static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 260                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 261                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
 262 {
 263         //FIXME Optimize (just quickly writen not opti..)
 264         int i;
 265         for(i=0; i<dstW; i++)
 266         {
 267                 int val=0;
 268                 int j;
 269                 for(j=0; j<lumFilterSize; j++)
 270                         val += lumSrc[j][i] * lumFilter[j];
 271
 272                 dest[i]= MIN(MAX(val>>19, 0), 255);
 273         }
 274
 275         if(uDest != NULL)
 276                 for(i=0; i<(dstW>>1); i++)
 277                 {
 278                         int u=0;
 279                         int v=0;
 280                         int j;
 281                         for(j=0; j<chrFilterSize; j++)
 282                         {
 283                                 u += chrSrc[j][i] * chrFilter[j];
 284                                 v += chrSrc[j][i + 2048] * chrFilter[j];
 285                         }
 286
 287                         uDest[i]= MIN(MAX(u>>19, 0), 255);
 288                         vDest[i]= MIN(MAX(v>>19, 0), 255);
 289                 }
 290 }
 291
 292 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 293                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 294                                     uint8_t *dest, int dstW, int dstFormat)
 295 {
 296         if(dstFormat==IMGFMT_BGR32)
 297         {
 298                 int i;
 299 #ifdef WORDS_BIGENDIAN
 300         dest++;
 301 #endif
 302                 for(i=0; i<(dstW>>1); i++){
 303                         int j;
 304                         int Y1=0;
 305                         int Y2=0;
 306                         int U=0;
 307                         int V=0;
 308                         int Cb, Cr, Cg;
 309                         for(j=0; j<lumFilterSize; j++)
 310                         {
 311                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 312                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 313                         }
 314                         for(j=0; j<chrFilterSize; j++)
 315                         {
 316                                 U += chrSrc[j][i] * chrFilter[j];
 317                                 V += chrSrc[j][i+2048] * chrFilter[j];
 318                         }
 319                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 320                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 321                         U >>= 19;
 322                         V >>= 19;
 323
 324                         Cb= clip_yuvtab_40cf[U+ 256];
 325                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 326                         Cr= clip_yuvtab_3343[V+ 256];
 327
 328                         dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
 329                         dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
 330                         dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
 331
 332                         dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
 333                         dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
 334                         dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
 335                 }
 336         }
 337         else if(dstFormat==IMGFMT_BGR24)
 338         {
 339                 int i;
 340                 for(i=0; i<(dstW>>1); i++){
 341                         int j;
 342                         int Y1=0;
 343                         int Y2=0;
 344                         int U=0;
 345                         int V=0;
 346                         int Cb, Cr, Cg;
 347                         for(j=0; j<lumFilterSize; j++)
 348                         {
 349                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 350                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 351                         }
 352                         for(j=0; j<chrFilterSize; j++)
 353                         {
 354                                 U += chrSrc[j][i] * chrFilter[j];
 355                                 V += chrSrc[j][i+2048] * chrFilter[j];
 356                         }
 357                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 358                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 359                         U >>= 19;
 360                         V >>= 19;
 361
 362                         Cb= clip_yuvtab_40cf[U+ 256];
 363                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 364                         Cr= clip_yuvtab_3343[V+ 256];
 365
 366                         dest[0]=clip_table[((Y1 + Cb) >>13)];
 367                         dest[1]=clip_table[((Y1 + Cg) >>13)];
 368                         dest[2]=clip_table[((Y1 + Cr) >>13)];
 369
 370                         dest[3]=clip_table[((Y2 + Cb) >>13)];
 371                         dest[4]=clip_table[((Y2 + Cg) >>13)];
 372                         dest[5]=clip_table[((Y2 + Cr) >>13)];
 373                         dest+=6;
 374                 }
 375         }
 376         else if(dstFormat==IMGFMT_BGR16)
 377         {
 378                 int i;
 379 #ifdef DITHER1XBPP
 380                 static int ditherb1=1<<14;
 381                 static int ditherg1=1<<13;
 382                 static int ditherr1=2<<14;
 383                 static int ditherb2=3<<14;
 384                 static int ditherg2=3<<13;
 385                 static int ditherr2=0<<14;
 386
 387                 ditherb1 ^= (1^2)<<14;
 388                 ditherg1 ^= (1^2)<<13;
 389                 ditherr1 ^= (1^2)<<14;
 390                 ditherb2 ^= (3^0)<<14;
 391                 ditherg2 ^= (3^0)<<13;
 392                 ditherr2 ^= (3^0)<<14;
 393 #else
 394                 const int ditherb1=0;
 395                 const int ditherg1=0;
 396                 const int ditherr1=0;
 397                 const int ditherb2=0;
 398                 const int ditherg2=0;
 399                 const int ditherr2=0;
 400 #endif
 401                 for(i=0; i<(dstW>>1); i++){
 402                         int j;
 403                         int Y1=0;
 404                         int Y2=0;
 405                         int U=0;
 406                         int V=0;
 407                         int Cb, Cr, Cg;
 408                         for(j=0; j<lumFilterSize; j++)
 409                         {
 410                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 411                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 412                         }
 413                         for(j=0; j<chrFilterSize; j++)
 414                         {
 415                                 U += chrSrc[j][i] * chrFilter[j];
 416                                 V += chrSrc[j][i+2048] * chrFilter[j];
 417                         }
 418                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 419                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 420                         U >>= 19;
 421                         V >>= 19;
 422
 423                         Cb= clip_yuvtab_40cf[U+ 256];
 424                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 425                         Cr= clip_yuvtab_3343[V+ 256];
 426
 427                         ((uint16_t*)dest)[2*i] =
 428                                 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
 429                                 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
 430                                 clip_table16r[(Y1 + Cr + ditherr1) >>13];
 431
 432                         ((uint16_t*)dest)[2*i+1] =
 433                                 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
 434                                 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
 435                                 clip_table16r[(Y2 + Cr + ditherr2) >>13];
 436                 }
 437         }
 438         else if(dstFormat==IMGFMT_BGR15)
 439         {
 440                 int i;
 441 #ifdef DITHER1XBPP
 442                 static int ditherb1=1<<14;
 443                 static int ditherg1=1<<14;
 444                 static int ditherr1=2<<14;
 445                 static int ditherb2=3<<14;
 446                 static int ditherg2=3<<14;
 447                 static int ditherr2=0<<14;
 448
 449                 ditherb1 ^= (1^2)<<14;
 450                 ditherg1 ^= (1^2)<<14;
 451                 ditherr1 ^= (1^2)<<14;
 452                 ditherb2 ^= (3^0)<<14;
 453                 ditherg2 ^= (3^0)<<14;
 454                 ditherr2 ^= (3^0)<<14;
 455 #else
 456                 const int ditherb1=0;
 457                 const int ditherg1=0;
 458                 const int ditherr1=0;
 459                 const int ditherb2=0;
 460                 const int ditherg2=0;
 461                 const int ditherr2=0;
 462 #endif
 463                 for(i=0; i<(dstW>>1); i++){
 464                         int j;
 465                         int Y1=0;
 466                         int Y2=0;
 467                         int U=0;
 468                         int V=0;
 469                         int Cb, Cr, Cg;
 470                         for(j=0; j<lumFilterSize; j++)
 471                         {
 472                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 473                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 474                         }
 475                         for(j=0; j<chrFilterSize; j++)
 476                         {
 477                                 U += chrSrc[j][i] * chrFilter[j];
 478                                 V += chrSrc[j][i+2048] * chrFilter[j];
 479                         }
 480                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 481                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 482                         U >>= 19;
 483                         V >>= 19;
 484
 485                         Cb= clip_yuvtab_40cf[U+ 256];
 486                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 487                         Cr= clip_yuvtab_3343[V+ 256];
 488
 489                         ((uint16_t*)dest)[2*i] =
 490                                 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
 491                                 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
 492                                 clip_table15r[(Y1 + Cr + ditherr1) >>13];
 493
 494                         ((uint16_t*)dest)[2*i+1] =
 495                                 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
 496                                 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
 497                                 clip_table15r[(Y2 + Cr + ditherr2) >>13];
 498                 }
 499         }
 500 }
 501
 502
     // swscale_template.c is included once per CPU variant that should be built. Before each inclusion
     // RENAME() is set to append a variant suffix (_C, _MMX, _MMX2, _3DNow) and the HAVE_* macros are
     // set to describe that variant. Without RUNTIME_CPUDETECT only the variant matching the configured
     // HAVE_* macros is selected; with RUNTIME_CPUDETECT all of them are.
 503 //Note: we have C, X86, MMX, MMX2, 3DNOW versions; there is no 3DNOW+MMX2 one
 504 //Plain C versions
 505 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
 506 #define COMPILE_C
 507 #endif
 508
 509 #ifdef CAN_COMPILE_X86_ASM
 510
 511 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 512 #define COMPILE_MMX
 513 #endif
 514
 515 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
 516 #define COMPILE_MMX2
 517 #endif
 518
 519 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 520 #define COMPILE_3DNOW
 521 #endif
 522 #endif //CAN_COMPILE_X86_ASM
 523
     // The COMPILE_* decisions are final at this point; the configured HAVE_* values are dropped so
     // that each section below can set them explicitly for its own variant.
 524 #undef HAVE_MMX
 525 #undef HAVE_MMX2
 526 #undef HAVE_3DNOW
 527
 528 #ifdef COMPILE_C
 529 #undef HAVE_MMX
 530 #undef HAVE_MMX2
 531 #undef HAVE_3DNOW
 532 #define RENAME(a) a ## _C
 533 #include "swscale_template.c"
 534 #endif
 535
 536 #ifdef CAN_COMPILE_X86_ASM
 537
     // The plain-x86 variant is disabled (commented out).
 538 //X86 versions
 539 /*
 540 #undef RENAME
 541 #undef HAVE_MMX
 542 #undef HAVE_MMX2
 543 #undef HAVE_3DNOW
 544 #define ARCH_X86
 545 #define RENAME(a) a ## _X86
 546 #include "swscale_template.c"
 547 */
 548 //MMX versions
 549 #ifdef COMPILE_MMX
 550 #undef RENAME
 551 #define HAVE_MMX
 552 #undef HAVE_MMX2
 553 #undef HAVE_3DNOW
 554 #define RENAME(a) a ## _MMX
 555 #include "swscale_template.c"
 556 #endif
 557
 558 //MMX2 versions
 559 #ifdef COMPILE_MMX2
 560 #undef RENAME
 561 #define HAVE_MMX
 562 #define HAVE_MMX2
 563 #undef HAVE_3DNOW
 564 #define RENAME(a) a ## _MMX2
 565 #include "swscale_template.c"
 566 #endif
 567
 568 //3DNOW versions
 569 #ifdef COMPILE_3DNOW
 570 #undef RENAME
 571 #define HAVE_MMX
 572 #undef HAVE_MMX2
 573 #define HAVE_3DNOW
 574 #define RENAME(a) a ## _3DNow
 575 #include "swscale_template.c"
 576 #endif
 577
 578 #endif //CAN_COMPILE_X86_ASM
 579
     // From here on HAVE_MMX/HAVE_MMX2/HAVE_3DNOW reflect whichever variant was included last,
     // not the build configuration.
 580 // minor note: the HAVE_xyz macros are messed up after this line, so don't use them
 581
 582
 583 // old global scaler, dont use for new code
 584 // will use sws_flags from the command line
 585 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
 586                              int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
 587                              int srcW, int srcH, int dstW, int dstH){
 588
 589         static SwsContext *context=NULL;
 590         int dstFormat;
 591         int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
 592
 593         switch(dstbpp)
 594         {
 595                 case 8 : dstFormat= IMGFMT_Y8;          break;
 596                 case 12: dstFormat= IMGFMT_YV12;        break;
 597                 case 15: dstFormat= IMGFMT_BGR15;       break;
 598                 case 16: dstFormat= IMGFMT_BGR16;       break;
 599                 case 24: dstFormat= IMGFMT_BGR24;       break;
 600                 case 32: dstFormat= IMGFMT_BGR32;       break;
 601                 default: return;
 602         }
 603
 604         if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
 605
 606         context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
 607 }
 608
 609 // will use sws_flags & src_filter (from cmd line)
 610 SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
 611 {
 612         int flags=0;
 613         static int firstTime=1;
 614
 615 #ifdef ARCH_X86
 616         if(gCpuCaps.hasMMX)
 617                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 618 #endif
 619         if(firstTime)
 620         {
 621                 firstTime=0;
 622                 flags= SWS_PRINT_INFO;
 623         }
 624         else if(verbose>1) flags= SWS_PRINT_INFO;
 625
 626         if(src_filter.lumH) freeVec(src_filter.lumH);
 627         if(src_filter.lumV) freeVec(src_filter.lumV);
 628         if(src_filter.chrH) freeVec(src_filter.chrH);
 629         if(src_filter.chrV) freeVec(src_filter.chrV);
 630
 631         if(sws_lum_gblur!=0.0){
 632                 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
 633                 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
 634         }else{
 635                 src_filter.lumH= getIdentityVec();
 636                 src_filter.lumV= getIdentityVec();
 637         }
 638
 639         if(sws_chr_gblur!=0.0){
 640                 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
 641                 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
 642         }else{
 643                 src_filter.chrH= getIdentityVec();
 644                 src_filter.chrV= getIdentityVec();
 645         }
 646
 647         if(sws_chr_sharpen!=0.0){
 648                 SwsVector *g= getConstVec(-1.0, 3);
 649                 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
 650                 g->coeff[1]=2.0;
 651                 addVec(id, g);
 652                 convVec(src_filter.chrH, id);
 653                 convVec(src_filter.chrV, id);
 654                 freeVec(g);
 655                 freeVec(id);
 656         }
 657
 658         if(sws_lum_sharpen!=0.0){
 659                 SwsVector *g= getConstVec(-1.0, 3);
 660                 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
 661                 g->coeff[1]=2.0;
 662                 addVec(id, g);
 663                 convVec(src_filter.lumH, id);
 664                 convVec(src_filter.lumV, id);
 665                 freeVec(g);
 666                 freeVec(id);
 667         }
 668
 669         if(sws_chr_hshift)
 670                 shiftVec(src_filter.chrH, sws_chr_hshift);
 671
 672         if(sws_chr_vshift)
 673                 shiftVec(src_filter.chrV, sws_chr_vshift);
 674
 675         normalizeVec(src_filter.chrH, 1.0);
 676         normalizeVec(src_filter.chrV, 1.0);
 677         normalizeVec(src_filter.lumH, 1.0);
 678         normalizeVec(src_filter.lumV, 1.0);
 679
 680         if(verbose > 1) printVec(src_filter.chrH);
 681         if(verbose > 1) printVec(src_filter.lumH);
 682
 683         switch(sws_flags)
 684         {
 685                 case 0: flags|= SWS_FAST_BILINEAR; break;
 686                 case 1: flags|= SWS_BILINEAR; break;
 687                 case 2: flags|= SWS_BICUBIC; break;
 688                 case 3: flags|= SWS_X; break;
 689                 case 4: flags|= SWS_POINT; break;
 690                 case 5: flags|= SWS_AREA; break;
 691                 default:flags|= SWS_BILINEAR; break;
 692         }
 693
 694         return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
 695 }
 696
 697
 698 static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
 699                               int srcW, int dstW, int filterAlign, int one, int flags,
 700                               SwsVector *srcFilter, SwsVector *dstFilter)
 701 {
 702         int i;
 703         int filterSize;
 704         int filter2Size;
 705         int minFilterSize;
 706         double *filter=NULL;
 707         double *filter2=NULL;
 708 #ifdef ARCH_X86
 709         if(gCpuCaps.hasMMX)
 710                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 711 #endif
 712
 713         // Note the +1 is for the MMXscaler which reads over the end
 714         *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
 715
 716         if(ABS(xInc - 0x10000) <10) // unscaled
 717         {
 718                 int i;
 719                 filterSize= 1;
 720                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 721                 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
 722
 723                 for(i=0; i<dstW; i++)
 724                 {
 725                         filter[i*filterSize]=1;
 726                         (*filterPos)[i]=i;
 727                 }
 728
 729         }
 730         else if(flags&SWS_POINT) // lame looking point sampling mode
 731         {
 732                 int i;
 733                 int xDstInSrc;
 734                 filterSize= 1;
 735                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 736
 737                 xDstInSrc= xInc/2 - 0x8000;
 738                 for(i=0; i<dstW; i++)
 739                 {
 740                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
 741
 742                         (*filterPos)[i]= xx;
 743                         filter[i]= 1.0;
 744                         xDstInSrc+= xInc;
 745                 }
 746         }
 747         else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
 748         {
 749                 int i;
 750                 int xDstInSrc;
 751                 if     (flags&SWS_BICUBIC) filterSize= 4;
 752                 else if(flags&SWS_X      ) filterSize= 4;
 753                 else                       filterSize= 2; // SWS_BILINEAR / SWS_AREA
 754 //              printf("%d %d %d\n", filterSize, srcW, dstW);
 755                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 756
 757                 xDstInSrc= xInc/2 - 0x8000;
 758                 for(i=0; i<dstW; i++)
 759                 {
 760                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
 761                         int j;
 762
 763                         (*filterPos)[i]= xx;
 764                         if((flags & SWS_BICUBIC) || (flags & SWS_X))
 765                         {
 766                                 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
 767                                 double y1,y2,y3,y4;
 768                                 double A= -0.6;
 769                                 if(flags & SWS_BICUBIC){
 770                                                 // Equation is from VirtualDub
 771                                         y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
 772                                         y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
 773                                         y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
 774                                         y4 = (                  +           A*d*d -       A*d*d*d);
 775                                 }else{
 776                                                 // cubic interpolation (derived it myself)
 777                                         y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
 778                                         y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
 779                                         y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
 780                                         y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
 781                                 }
 782
 783 //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
 784                                 filter[i*filterSize + 0]= y1;
 785                                 filter[i*filterSize + 1]= y2;
 786                                 filter[i*filterSize + 2]= y3;
 787                                 filter[i*filterSize + 3]= y4;
 788 //                              printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
 789                         }
 790                         else
 791                         {
 792                                 //Bilinear upscale / linear interpolate / Area averaging
 793                                 for(j=0; j<filterSize; j++)
 794                                 {
 795                                         double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
 796                                         double coeff= 1.0 - d;
 797                                         if(coeff<0) coeff=0;
 798         //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
 799                                         filter[i*filterSize + j]= coeff;
 800                                         xx++;
 801                                 }
 802                         }
 803                         xDstInSrc+= xInc;
 804                 }
 805         }
 806         else // downscale
 807         {
 808                 int xDstInSrc;
 809                 ASSERT(dstW <= srcW)
 810
 811                 if(flags&SWS_BICUBIC)   filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
 812                 else if(flags&SWS_X)    filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
 813                 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
 814                 else /* BILINEAR */     filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
 815 //              printf("%d %d %d\n", *filterSize, srcW, dstW);
 816                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 817
 818                 xDstInSrc= xInc/2 - 0x8000;
 819                 for(i=0; i<dstW; i++)
 820                 {
 821                         int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
 822                         int j;
 823                         (*filterPos)[i]= xx;
 824                         for(j=0; j<filterSize; j++)
 825                         {
 826                                 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
 827                                 double coeff;
 828                                 if((flags & SWS_BICUBIC) || (flags & SWS_X))
 829                                 {
 830                                         double A= -0.75;
 831 //                                      d*=2;
 832                                         // Equation is from VirtualDub
 833                                         if(d<1.0)
 834                                                 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
 835                                         else if(d<2.0)
 836                                                 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
 837                                         else
 838                                                 coeff=0.0;
 839                                 }
 840                                 else if(flags & SWS_AREA)
 841                                 {
 842                                         double srcPixelSize= (1<<16)/(double)xInc;
 843                                         if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
 844                                         else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
 845                                         else coeff=0.0;
 846                                 }
 847                                 else
 848                                 {
 849                                         coeff= 1.0 - d;
 850                                         if(coeff<0) coeff=0;
 851                                 }
 852 //                              printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
 853                                 filter[i*filterSize + j]= coeff;
 854                                 xx++;
 855                         }
 856                         xDstInSrc+= xInc;
 857                 }
 858         }
 859
 860         /* apply src & dst Filter to filter -> filter2
 861            free(filter);
 862         */
 863         ASSERT(filterSize>0)
 864         filter2Size= filterSize;
 865         if(srcFilter) filter2Size+= srcFilter->length - 1;
 866         if(dstFilter) filter2Size+= dstFilter->length - 1;
 867         ASSERT(filter2Size>0)
 868         filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
 869
 870         for(i=0; i<dstW; i++)
 871         {
 872                 int j;
 873                 SwsVector scaleFilter;
 874                 SwsVector *outVec;
 875
 876                 scaleFilter.coeff= filter + i*filterSize;
 877                 scaleFilter.length= filterSize;
 878
 879                 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
 880                 else          outVec= &scaleFilter;
 881
 882                 ASSERT(outVec->length == filter2Size)
 883                 //FIXME dstFilter
 884
 885                 for(j=0; j<outVec->length; j++)
 886                 {
 887                         filter2[i*filter2Size + j]= outVec->coeff[j];
 888                 }
 889
 890                 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
 891
 892                 if(outVec != &scaleFilter) freeVec(outVec);
 893         }
 894         free(filter); filter=NULL;
 895
 896         /* try to reduce the filter-size (step1 find size and shift left) */
 897         // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
 898         minFilterSize= 0;
 899         for(i=dstW-1; i>=0; i--)
 900         {
 901                 int min= filter2Size;
 902                 int j;
 903                 double cutOff=0.0;
 904
 905                 /* get rid off near zero elements on the left by shifting left */
 906                 for(j=0; j<filter2Size; j++)
 907                 {
 908                         int k;
 909                         cutOff += ABS(filter2[i*filter2Size]);
 910
 911                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
 912
 913                         /* preserve Monotonicity because the core cant handle the filter otherwise */
 914                         if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
 915
 916                         // Move filter coeffs left
 917                         for(k=1; k<filter2Size; k++)
 918                                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
 919                         filter2[i*filter2Size + k - 1]= 0.0;
 920                         (*filterPos)[i]++;
 921                 }
 922
 923                 cutOff=0.0;
 924                 /* count near zeros on the right */
 925                 for(j=filter2Size-1; j>0; j--)
 926                 {
 927                         cutOff += ABS(filter2[i*filter2Size + j]);
 928
 929                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
 930                         min--;
 931                 }
 932
 933                 if(min>minFilterSize) minFilterSize= min;
 934         }
 935
 936         ASSERT(minFilterSize > 0)
 937         filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
 938         ASSERT(filterSize > 0)
 939         filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
 940         *outFilterSize= filterSize;
 941
 942         if(flags&SWS_PRINT_INFO)
 943                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
 944         /* try to reduce the filter-size (step2 reduce it) */
 945         for(i=0; i<dstW; i++)
 946         {
 947                 int j;
 948
 949                 for(j=0; j<filterSize; j++)
 950                 {
 951                         if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
 952                         else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
 953                 }
 954         }
 955         free(filter2); filter2=NULL;
 956
 957
 958         //FIXME try to align filterpos if possible
 959
 960         //fix borders
 961         for(i=0; i<dstW; i++)
 962         {
 963                 int j;
 964                 if((*filterPos)[i] < 0)
 965                 {
 966                         // Move filter coeffs left to compensate for filterPos
 967                         for(j=1; j<filterSize; j++)
 968                         {
 969                                 int left= MAX(j + (*filterPos)[i], 0);
 970                                 filter[i*filterSize + left] += filter[i*filterSize + j];
 971                                 filter[i*filterSize + j]=0;
 972                         }
 973                         (*filterPos)[i]= 0;
 974                 }
 975
 976                 if((*filterPos)[i] + filterSize > srcW)
 977                 {
 978                         int shift= (*filterPos)[i] + filterSize - srcW;
 979                         // Move filter coeffs right to compensate for filterPos
 980                         for(j=filterSize-2; j>=0; j--)
 981                         {
 982                                 int right= MIN(j + shift, filterSize-1);
 983                                 filter[i*filterSize +right] += filter[i*filterSize +j];
 984                                 filter[i*filterSize +j]=0;
 985                         }
 986                         (*filterPos)[i]= srcW - filterSize;
 987                 }
 988         }
 989
 990         // Note the +1 is for the MMXscaler which reads over the end
 991         *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
 992         memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
 993
 994         /* Normalize & Store in outFilter */
 995         for(i=0; i<dstW; i++)
 996         {
 997                 int j;
 998                 double sum=0;
 999                 double scale= one;
1000                 for(j=0; j<filterSize; j++)
1001                 {
1002                         sum+= filter[i*filterSize + j];
1003                 }
1004                 scale/= sum;
1005                 for(j=0; j<filterSize; j++)
1006                 {
1007                         (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
1008                 }
1009         }
1010
1011         (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1012         for(i=0; i<*outFilterSize; i++)
1013         {
1014                 int j= dstW*(*outFilterSize);
1015                 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1016         }
1017
1018         free(filter);
1019 }
1020
1021 #ifdef ARCH_X86
1022 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
1023 {
1024         uint8_t *fragmentA;
1025         int imm8OfPShufW1A;
1026         int imm8OfPShufW2A;
1027         int fragmentLengthA;
1028         uint8_t *fragmentB;
1029         int imm8OfPShufW1B;
1030         int imm8OfPShufW2B;
1031         int fragmentLengthB;
1032         int fragmentPos;
1033
1034         int xpos, i;
1035
1036         // create an optimized horizontal scaling routine
1037
1038         //code fragment
1039
1040         asm volatile(
1041                 "jmp 9f                         \n\t"
1042         // Begin
1043                 "0:                             \n\t"
1044                 "movq (%%edx, %%eax), %%mm3     \n\t"
1045                 "movd (%%ecx, %%esi), %%mm0     \n\t"
1046                 "movd 1(%%ecx, %%esi), %%mm1    \n\t"
1047                 "punpcklbw %%mm7, %%mm1         \n\t"
1048                 "punpcklbw %%mm7, %%mm0         \n\t"
1049                 "pshufw $0xFF, %%mm1, %%mm1     \n\t"
1050                 "1:                             \n\t"
1051                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
1052                 "2:                             \n\t"
1053                 "psubw %%mm1, %%mm0             \n\t"
1054                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
1055                 "pmullw %%mm3, %%mm0            \n\t"
1056                 "psllw $7, %%mm1                \n\t"
1057                 "paddw %%mm1, %%mm0             \n\t"
1058
1059                 "movq %%mm0, (%%edi, %%eax)     \n\t"
1060
1061                 "addl $8, %%eax                 \n\t"
1062         // End
1063                 "9:                             \n\t"
1064 //              "int $3\n\t"
1065                 "leal 0b, %0                    \n\t"
1066                 "leal 1b, %1                    \n\t"
1067                 "leal 2b, %2                    \n\t"
1068                 "decl %1                        \n\t"
1069                 "decl %2                        \n\t"
1070                 "subl %0, %1                    \n\t"
1071                 "subl %0, %2                    \n\t"
1072                 "leal 9b, %3                    \n\t"
1073                 "subl %0, %3                    \n\t"
1074
1075
1076                 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1077                 "=r" (fragmentLengthA)
1078         );
1079
1080         asm volatile(
1081                 "jmp 9f                         \n\t"
1082         // Begin
1083                 "0:                             \n\t"
1084                 "movq (%%edx, %%eax), %%mm3     \n\t"
1085                 "movd (%%ecx, %%esi), %%mm0     \n\t"
1086                 "punpcklbw %%mm7, %%mm0         \n\t"
1087                 "pshufw $0xFF, %%mm0, %%mm1     \n\t"
1088                 "1:                             \n\t"
1089                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
1090                 "2:                             \n\t"
1091                 "psubw %%mm1, %%mm0             \n\t"
1092                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
1093                 "pmullw %%mm3, %%mm0            \n\t"
1094                 "psllw $7, %%mm1                \n\t"
1095                 "paddw %%mm1, %%mm0             \n\t"
1096
1097                 "movq %%mm0, (%%edi, %%eax)     \n\t"
1098
1099                 "addl $8, %%eax                 \n\t"
1100         // End
1101                 "9:                             \n\t"
1102 //              "int $3\n\t"
1103                 "leal 0b, %0                    \n\t"
1104                 "leal 1b, %1                    \n\t"
1105                 "leal 2b, %2                    \n\t"
1106                 "decl %1                        \n\t"
1107                 "decl %2                        \n\t"
1108                 "subl %0, %1                    \n\t"
1109                 "subl %0, %2                    \n\t"
1110                 "leal 9b, %3                    \n\t"
1111                 "subl %0, %3                    \n\t"
1112
1113
1114                 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1115                 "=r" (fragmentLengthB)
1116         );
1117
1118         xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1119         fragmentPos=0;
1120
1121         for(i=0; i<dstW/numSplits; i++)
1122         {
1123                 int xx=xpos>>16;
1124
1125                 if((i&3) == 0)
1126                 {
1127                         int a=0;
1128                         int b=((xpos+xInc)>>16) - xx;
1129                         int c=((xpos+xInc*2)>>16) - xx;
1130                         int d=((xpos+xInc*3)>>16) - xx;
1131
1132                         filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
1133                         filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
1134                         filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
1135                         filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
1136                         filterPos[i/2]= xx;
1137
1138                         if(d+1<4)
1139                         {
1140                                 int maxShift= 3-(d+1);
1141                                 int shift=0;
1142
1143                                 memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
1144
1145                                 funnyCode[fragmentPos + imm8OfPShufW1B]=
1146                                         (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
1147                                 funnyCode[fragmentPos + imm8OfPShufW2B]=
1148                                         a | (b<<2) | (c<<4) | (d<<6);
1149
1150                                 if(i+3>=dstW) shift=maxShift; //avoid overread
1151                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
1152
1153                                 if(shift && i>=shift)
1154                                 {
1155                                         funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
1156                                         funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
1157                                         filterPos[i/2]-=shift;
1158                                 }
1159
1160                                 fragmentPos+= fragmentLengthB;
1161                         }
1162                         else
1163                         {
1164                                 int maxShift= 3-d;
1165                                 int shift=0;
1166
1167                                 memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
1168
1169                                 funnyCode[fragmentPos + imm8OfPShufW1A]=
1170                                 funnyCode[fragmentPos + imm8OfPShufW2A]=
1171                                         a | (b<<2) | (c<<4) | (d<<6);
1172
1173                                 if(i+4>=dstW) shift=maxShift; //avoid overread
1174                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
1175
1176                                 if(shift && i>=shift)
1177                                 {
1178                                         funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
1179                                         funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
1180                                         filterPos[i/2]-=shift;
1181                                 }
1182
1183                                 fragmentPos+= fragmentLengthA;
1184                         }
1185
1186                         funnyCode[fragmentPos]= RET;
1187                 }
1188                 xpos+=xInc;
1189         }
1190         filterPos[i/2]= xpos>>16; // needed to jump to the next part
1191 }
1192 #endif // ARCH_X86
1193
1194 //FIXME remove
/**
 * Empty initialisation entry point; it does nothing.
 */
void SwScale_Init()
{
}
1197
1198 static void globalInit(){
1199     // generating tables:
1200     int i;
1201     for(i=0; i<768; i++){
1202         int c= MIN(MAX(i-256, 0), 255);
1203         clip_table[i]=c;
1204         yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1205         yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1206         yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1207         yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1208         yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1209     }
1210
1211     for(i=0; i<768; i++)
1212     {
1213         int v= clip_table[i];
1214         clip_table16b[i]=  v>>3;
1215         clip_table16g[i]= (v<<3)&0x07E0;
1216         clip_table16r[i]= (v<<8)&0xF800;
1217         clip_table15b[i]=  v>>3;
1218         clip_table15g[i]= (v<<2)&0x03E0;
1219         clip_table15r[i]= (v<<7)&0x7C00;
1220     }
1221
1222 cpuCaps= gCpuCaps;
1223
1224 #ifdef RUNTIME_CPUDETECT
1225 #ifdef CAN_COMPILE_X86_ASM
1226         // ordered per speed fasterst first
1227         if(gCpuCaps.hasMMX2)
1228                 swScale= swScale_MMX2;
1229         else if(gCpuCaps.has3DNow)
1230                 swScale= swScale_3DNow;
1231         else if(gCpuCaps.hasMMX)
1232                 swScale= swScale_MMX;
1233         else
1234                 swScale= swScale_C;
1235
1236 #else
1237         swScale= swScale_C;
1238         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1239 #endif
1240 #else //RUNTIME_CPUDETECT
1241 #ifdef HAVE_MMX2
1242         swScale= swScale_MMX2;
1243         cpuCaps.has3DNow = 0;
1244 #elif defined (HAVE_3DNOW)
1245         swScale= swScale_3DNow;
1246         cpuCaps.hasMMX2 = 0;
1247 #elif defined (HAVE_MMX)
1248         swScale= swScale_MMX;
1249         cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1250 #else
1251         swScale= swScale_C;
1252         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1253 #endif
1254 #endif //!RUNTIME_CPUDETECT
1255 }
1256
/* Wrapper functions for yuv2bgr */
1258 static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1259              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1260         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1261
1262         if(c->srcFormat==IMGFMT_YV12)
1263                 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1264         else /* I420 & IYUV */
1265                 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1266 }
1267
1268 static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1269              int srcSliceH, uint8_t* dst[], int dstStride[]){
1270
1271         if(dstStride[0]*3==srcStride[0]*4)
1272                 rgb24to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1273         else
1274         {
1275                 int i;
1276                 uint8_t *srcPtr= src[0];
1277                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1278
1279                 for(i=0; i<srcSliceH; i++)
1280                 {
1281                         rgb24to32(srcPtr, dstPtr, c->srcW*3);
1282                         srcPtr+= srcStride[0];
1283                         dstPtr+= dstStride[0];
1284                 }
1285         }
1286 }
1287
1288 static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1289              int srcSliceH, uint8_t* dst[], int dstStride[]){
1290
1291         if(dstStride[0]*4==srcStride[0]*3)
1292                 rgb32to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1293         else
1294         {
1295                 int i;
1296                 uint8_t *srcPtr= src[0];
1297                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1298
1299                 for(i=0; i<srcSliceH; i++)
1300                 {
1301                         rgb32to24(srcPtr, dstPtr, c->srcW<<2);
1302                         srcPtr+= srcStride[0];
1303                         dstPtr+= dstStride[0];
1304                 }
1305         }
1306 }
1307
1308 static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1309              int srcSliceH, uint8_t* dst[], int dstStride[]){
1310
1311         if(dstStride[0]==srcStride[0])
1312                 rgb15to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1313         else
1314         {
1315                 int i;
1316                 uint8_t *srcPtr= src[0];
1317                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1318
1319                 for(i=0; i<srcSliceH; i++)
1320                 {
1321                         rgb15to16(srcPtr, dstPtr, c->srcW<<1);
1322                         srcPtr+= srcStride[0];
1323                         dstPtr+= dstStride[0];
1324                 }
1325         }
1326 }
1327
1328 static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1329              int srcSliceH, uint8_t* dst[], int dstStride[]){
1330
1331         rgb24toyv12(
1332                 src[0],
1333                 dst[0]+ srcSliceY    *dstStride[0],
1334                 dst[1]+(srcSliceY>>1)*dstStride[1],
1335                 dst[2]+(srcSliceY>>1)*dstStride[2],
1336                 c->srcW, srcSliceH,
1337                 dstStride[0], dstStride[1], srcStride[0]);
1338 }
1339
1340
1341 /* unscaled copy like stuff (assumes nearly identical formats) */
1342 static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
1343              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1344
1345         int srcStride[3];
1346         uint8_t *src[3];
1347         uint8_t *dst[3];
1348
1349         if(c->srcFormat == IMGFMT_I420){
1350                 src[0]= srcParam[0];
1351                 src[1]= srcParam[2];
1352                 src[2]= srcParam[1];
1353                 srcStride[0]= srcStrideParam[0];
1354                 srcStride[1]= srcStrideParam[2];
1355                 srcStride[2]= srcStrideParam[1];
1356         }
1357         else if(c->srcFormat==IMGFMT_YV12){
1358                 src[0]= srcParam[0];
1359                 src[1]= srcParam[1];
1360                 src[2]= srcParam[2];
1361                 srcStride[0]= srcStrideParam[0];
1362                 srcStride[1]= srcStrideParam[1];
1363                 srcStride[2]= srcStrideParam[2];
1364         }
1365         else if(isPacked(c->srcFormat) || isGray(c->srcFormat)){
1366                 src[0]= srcParam[0];
1367                 src[1]=
1368                 src[2]= NULL;
1369                 srcStride[0]= srcStrideParam[0];
1370                 srcStride[1]=
1371                 srcStride[2]= 0;
1372         }
1373
1374         if(c->dstFormat == IMGFMT_I420){
1375                 dst[0]= dstParam[0];
1376                 dst[1]= dstParam[2];
1377                 dst[2]= dstParam[1];
1378
1379         }else{
1380                 dst[0]= dstParam[0];
1381                 dst[1]= dstParam[1];
1382                 dst[2]= dstParam[2];
1383         }
1384
1385         if(isPacked(c->srcFormat))
1386         {
1387                 if(dstStride[0]==srcStride[0])
1388                         memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1389                 else
1390                 {
1391                         int i;
1392                         uint8_t *srcPtr= src[0];
1393                         uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1394                         int length=0;
1395
1396                         /* universal length finder */
1397                         while(length+c->srcW <= ABS(dstStride[0])
1398                            && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
1399                         ASSERT(length!=0);
1400
1401                         for(i=0; i<srcSliceH; i++)
1402                         {
1403                                 memcpy(dstPtr, srcPtr, length);
1404                                 srcPtr+= srcStride[0];
1405                                 dstPtr+= dstStride[0];
1406                         }
1407                 }
1408         }
1409         else
1410         { /* Planar YUV */
1411                 int plane;
1412                 for(plane=0; plane<3; plane++)
1413                 {
1414                         int length= plane==0 ? c->srcW  : ((c->srcW+1)>>1);
1415                         int y=      plane==0 ? srcSliceY: ((srcSliceY+1)>>1);
1416                         int height= plane==0 ? srcSliceH: ((srcSliceH+1)>>1);
1417
1418                         if(dstStride[plane]==srcStride[plane])
1419                                 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1420                         else
1421                         {
1422                                 int i;
1423                                 uint8_t *srcPtr= src[plane];
1424                                 uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1425                                 for(i=0; i<height; i++)
1426                                 {
1427                                         memcpy(dstPtr, srcPtr, length);
1428                                         srcPtr+= srcStride[plane];
1429                                         dstPtr+= dstStride[plane];
1430                                 }
1431                         }
1432                 }
1433         }
1434 }
1435
1436 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1437                          SwsFilter *srcFilter, SwsFilter *dstFilter){
1438
1439         SwsContext *c;
1440         int i;
1441         int usesFilter;
1442         SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1443
1444 #ifdef ARCH_X86
1445         if(gCpuCaps.hasMMX)
1446                 asm volatile("emms\n\t"::: "memory");
1447 #endif
1448
1449         if(swScale==NULL) globalInit();
1450
1451         /* avoid dupplicate Formats, so we dont need to check to much */
1452         if(srcFormat==IMGFMT_IYUV) srcFormat=IMGFMT_I420;
1453         if(srcFormat==IMGFMT_Y8)   srcFormat=IMGFMT_Y800;
1454         if(dstFormat==IMGFMT_Y8)   dstFormat=IMGFMT_Y800;
1455
1456         if(!isSupportedIn(srcFormat))
1457         {
1458                 mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1459                 return NULL;
1460         }
1461         if(!isSupportedOut(dstFormat))
1462         {
1463                  mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1464                 return NULL;
1465         }
1466
1467         /* sanity check */
1468         if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1469         {
1470                  mp_msg(MSGT_SWS,MSGL_ERR,"swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
1471                         srcW, srcH, dstW, dstH);
1472                 return NULL;
1473         }
1474
1475         if(!dstFilter) dstFilter= &dummyFilter;
1476         if(!srcFilter) srcFilter= &dummyFilter;
1477
1478         c= memalign(64, sizeof(SwsContext));
1479         memset(c, 0, sizeof(SwsContext));
1480
1481         c->srcW= srcW;
1482         c->srcH= srcH;
1483         c->dstW= dstW;
1484         c->dstH= dstH;
1485         c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1486         c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1487         c->flags= flags;
1488         c->dstFormat= dstFormat;
1489         c->srcFormat= srcFormat;
1490
1491         usesFilter=0;
1492         if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1493         if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1494         if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1495         if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1496         if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1497         if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1498         if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1499         if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1500
1501         /* unscaled special Cases */
1502         if(srcW==dstW && srcH==dstH && !usesFilter)
1503         {
1504                 /* yuv2bgr */
1505                 if(isPlanarYUV(srcFormat) && isBGR(dstFormat))
1506                 {
1507                         // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1508 #ifdef WORDS_BIGENDIAN
1509                         if(dstFormat==IMGFMT_BGR32)
1510                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1511                         else
1512                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1513 #else
1514                         yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1515 #endif
1516                         c->swScale= planarYuvToBgr;
1517
1518                         if(flags&SWS_PRINT_INFO)
1519                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n",
1520                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1521                         return c;
1522                 }
1523
1524                 /* simple copy */
1525                 if(srcFormat == dstFormat || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)))
1526                 {
1527                         c->swScale= simpleCopy;
1528
1529                         if(flags&SWS_PRINT_INFO)
1530                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n",
1531                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1532                         return c;
1533                 }
1534
1535                 /* bgr32to24 & rgb32to24*/
1536                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1537                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1538                 {
1539                         c->swScale= bgr32to24Wrapper;
1540
1541                         if(flags&SWS_PRINT_INFO)
1542                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n",
1543                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1544                         return c;
1545                 }
1546
1547                 /* bgr24to32 & rgb24to32*/
1548                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1549                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1550                 {
1551                         c->swScale= bgr24to32Wrapper;
1552
1553                         if(flags&SWS_PRINT_INFO)
1554                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n",
1555                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1556                         return c;
1557                 }
1558
1559                 /* bgr15to16 */
1560                 if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
1561                 {
1562                         c->swScale= bgr15to16Wrapper;
1563
1564                         if(flags&SWS_PRINT_INFO)
1565                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n",
1566                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1567                         return c;
1568                 }
1569
1570                 /* bgr24toYV12 */
1571                 if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
1572                 {
1573                         c->swScale= bgr24toyv12Wrapper;
1574
1575                         if(flags&SWS_PRINT_INFO)
1576                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using unscaled %s -> %s special converter\n",
1577                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1578                         return c;
1579                 }
1580         }
1581
1582         if(cpuCaps.hasMMX2)
1583         {
1584                 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1585                 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1586                 {
1587                         if(flags&SWS_PRINT_INFO)
1588                                 mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1589                 }
1590         }
1591         else
1592                 c->canMMX2BeUsed=0;
1593
1594
1595         /* dont use full vertical UV input/internaly if the source doesnt even have it */
1596         if(isHalfChrV(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_V);
1597         /* dont use full horizontal UV input if the source doesnt even have it */
1598         if(isHalfChrH(srcFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INP);
1599         /* dont use full horizontal UV internally if the destination doesnt even have it */
1600         if(isHalfChrH(dstFormat)) c->flags= flags= flags&(~SWS_FULL_CHR_H_INT);
1601
1602         if(flags&SWS_FULL_CHR_H_INP)    c->chrSrcW= srcW;
1603         else                            c->chrSrcW= (srcW+1)>>1;
1604
1605         if(flags&SWS_FULL_CHR_H_INT)    c->chrDstW= dstW;
1606         else                            c->chrDstW= (dstW+1)>>1;
1607
1608         if(flags&SWS_FULL_CHR_V)        c->chrSrcH= srcH;
1609         else                            c->chrSrcH= (srcH+1)>>1;
1610
1611         if(isHalfChrV(dstFormat))       c->chrDstH= (dstH+1)>>1;
1612         else                            c->chrDstH= dstH;
1613
1614         c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
1615         c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
1616
1617
1618         // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1619         // but only for the FAST_BILINEAR mode otherwise do correct scaling
1620         // n-2 is the last chrominance sample available
1621         // this is not perfect, but noone shuld notice the difference, the more correct variant
1622         // would be like the vertical one, but that would require some special code for the
1623         // first and last pixel
1624         if(flags&SWS_FAST_BILINEAR)
1625         {
1626                 if(c->canMMX2BeUsed)
1627                 {
1628                         c->lumXInc+= 20;
1629                         c->chrXInc+= 20;
1630                 }
1631                 //we dont use the x86asm scaler if mmx is available
1632                 else if(cpuCaps.hasMMX)
1633                 {
1634                         c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1635                         c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
1636                 }
1637         }
1638
1639         /* precalculate horizontal scaler filter coefficients */
1640         {
1641                 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1642
1643                 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1644                                  srcW      ,       dstW, filterAlign, 1<<14, flags,
1645                                  srcFilter->lumH, dstFilter->lumH);
1646                 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1647                                 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1648                                  srcFilter->chrH, dstFilter->chrH);
1649
1650 #ifdef ARCH_X86
1651 // cant downscale !!!
1652                 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1653                 {
1654                         c->lumMmx2Filter   = (int16_t*)memalign(8, (dstW        /8+8)*sizeof(int16_t));
1655                         c->chrMmx2Filter   = (int16_t*)memalign(8, (c->chrDstW  /4+8)*sizeof(int16_t));
1656                         c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW      /2/8+8)*sizeof(int32_t));
1657                         c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
1658
1659                         initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
1660                         initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
1661                 }
1662 #endif
1663         } // Init Horizontal stuff
1664
1665
1666
1667         /* precalculate vertical scaler filter coefficients */
1668         initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1669                         srcH      ,        dstH, 1, (1<<12)-4, flags,
1670                         srcFilter->lumV, dstFilter->lumV);
1671         initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1672                         (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1673                          srcFilter->chrV, dstFilter->chrV);
1674
1675         // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1676         c->vLumBufSize= c->vLumFilterSize;
1677         c->vChrBufSize= c->vChrFilterSize;
1678         for(i=0; i<dstH; i++)
1679         {
1680                 int chrI= i*c->chrDstH / dstH;
1681                 int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1682                                  ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1683                 nextSlice&= ~1; // Slices start at even boundaries
1684                 if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1685                         c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1686                 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1687                         c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1688         }
1689
1690         // allocate pixbufs (we use dynamic allocation because otherwise we would need to
1691         c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1692         c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1693         //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
1694         for(i=0; i<c->vLumBufSize; i++)
1695                 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1696         for(i=0; i<c->vChrBufSize; i++)
1697                 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1698
1699         //try to avoid drawing green stuff between the right end and the stride end
1700         for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1701         for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1702
1703         ASSERT(c->chrDstH <= dstH)
1704
1705         // pack filter data for mmx code
1706         if(cpuCaps.hasMMX)
1707         {
1708                 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1709                 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1710                 for(i=0; i<c->vLumFilterSize*dstH; i++)
1711                         c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1712                                 c->vLumFilter[i];
1713                 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1714                         c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1715                                 c->vChrFilter[i];
1716         }
1717
1718         if(flags&SWS_PRINT_INFO)
1719         {
1720 #ifdef DITHER1XBPP
1721                 char *dither= " dithered";
1722 #else
1723                 char *dither= "";
1724 #endif
1725                 if(flags&SWS_FAST_BILINEAR)
1726                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: FAST_BILINEAR scaler, ");
1727                 else if(flags&SWS_BILINEAR)
1728                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: BILINEAR scaler, ");
1729                 else if(flags&SWS_BICUBIC)
1730                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: BICUBIC scaler, ");
1731                 else if(flags&SWS_X)
1732                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: Experimental scaler, ");
1733                 else if(flags&SWS_POINT)
1734                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: Nearest Neighbor / POINT scaler, ");
1735                 else if(flags&SWS_AREA)
1736                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: Area Averageing scaler, ");
1737                 else
1738                         mp_msg(MSGT_SWS,MSGL_INFO,"\nSwScaler: ehh flags invalid?! ");
1739
1740                 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
1741                         mp_msg(MSGT_SWS,MSGL_INFO,"from %s to%s %s ",
1742                                 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
1743                 else
1744                         mp_msg(MSGT_SWS,MSGL_INFO,"from %s to %s ",
1745                                 vo_format_name(srcFormat), vo_format_name(dstFormat));
1746
1747                 if(cpuCaps.hasMMX2)
1748                         mp_msg(MSGT_SWS,MSGL_INFO,"using MMX2\n");
1749                 else if(cpuCaps.has3DNow)
1750                         mp_msg(MSGT_SWS,MSGL_INFO,"using 3DNOW\n");
1751                 else if(cpuCaps.hasMMX)
1752                         mp_msg(MSGT_SWS,MSGL_INFO,"using MMX\n");
1753                 else
1754                         mp_msg(MSGT_SWS,MSGL_INFO,"using C\n");
1755         }
1756
1757         if((flags & SWS_PRINT_INFO) && verbose)
1758         {
1759                 if(cpuCaps.hasMMX)
1760                 {
1761                         if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1762                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1763                         else
1764                         {
1765                                 if(c->hLumFilterSize==4)
1766                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1767                                 else if(c->hLumFilterSize==8)
1768                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1769                                 else
1770                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1771
1772                                 if(c->hChrFilterSize==4)
1773                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1774                                 else if(c->hChrFilterSize==8)
1775                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1776                                 else
1777                                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1778                         }
1779                 }
1780                 else
1781                 {
1782 #ifdef ARCH_X86
1783                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using X86-Asm scaler for horizontal scaling\n");
1784 #else
1785                         if(flags & SWS_FAST_BILINEAR)
1786                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1787                         else
1788                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using C scaler for horizontal scaling\n");
1789 #endif
1790                 }
1791                 if(isPlanarYUV(dstFormat))
1792                 {
1793                         if(c->vLumFilterSize==1)
1794                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1795                         else
1796                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1797                 }
1798                 else
1799                 {
1800                         if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1801                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1802                                        "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1803                         else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1804                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1805                         else
1806                                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1807                 }
1808
1809                 if(dstFormat==IMGFMT_BGR24)
1810                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR24 Converter\n",
1811                                 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1812                 else if(dstFormat==IMGFMT_BGR32)
1813                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1814                 else if(dstFormat==IMGFMT_BGR16)
1815                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1816                 else if(dstFormat==IMGFMT_BGR15)
1817                         mp_msg(MSGT_SWS,MSGL_V,"SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1818
1819                 mp_msg(MSGT_SWS,MSGL_V,"SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1820         }
1821         if((flags & SWS_PRINT_INFO) && verbose>1)
1822         {
1823                 mp_msg(MSGT_SWS,MSGL_DBG2,"SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1824                         c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
1825                 mp_msg(MSGT_SWS,MSGL_DBG2,"SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
1826                         c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
1827         }
1828
1829         c->swScale= swScale;
1830         return c;
1831 }
1832
1833 /**
1834  * returns a normalized gaussian curve used to filter stuff
1835  * quality=3 is high quality, lowwer is lowwer quality
1836  */
1837
1838 SwsVector *getGaussianVec(double variance, double quality){
1839         const int length= (int)(variance*quality + 0.5) | 1;
1840         int i;
1841         double *coeff= memalign(sizeof(double), length*sizeof(double));
1842         double middle= (length-1)*0.5;
1843         SwsVector *vec= malloc(sizeof(SwsVector));
1844
1845         vec->coeff= coeff;
1846         vec->length= length;
1847
1848         for(i=0; i<length; i++)
1849         {
1850                 double dist= i-middle;
1851                 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1852         }
1853
1854         normalizeVec(vec, 1.0);
1855
1856         return vec;
1857 }
1858
1859 SwsVector *getConstVec(double c, int length){
1860         int i;
1861         double *coeff= memalign(sizeof(double), length*sizeof(double));
1862         SwsVector *vec= malloc(sizeof(SwsVector));
1863
1864         vec->coeff= coeff;
1865         vec->length= length;
1866
1867         for(i=0; i<length; i++)
1868                 coeff[i]= c;
1869
1870         return vec;
1871 }
1872
1873
1874 SwsVector *getIdentityVec(void){
1875         double *coeff= memalign(sizeof(double), sizeof(double));
1876         SwsVector *vec= malloc(sizeof(SwsVector));
1877         coeff[0]= 1.0;
1878
1879         vec->coeff= coeff;
1880         vec->length= 1;
1881
1882         return vec;
1883 }
1884
1885 void normalizeVec(SwsVector *a, double height){
1886         int i;
1887         double sum=0;
1888         double inv;
1889
1890         for(i=0; i<a->length; i++)
1891                 sum+= a->coeff[i];
1892
1893         inv= height/sum;
1894
1895         for(i=0; i<a->length; i++)
1896                 a->coeff[i]*= height;
1897 }
1898
1899 void scaleVec(SwsVector *a, double scalar){
1900         int i;
1901
1902         for(i=0; i<a->length; i++)
1903                 a->coeff[i]*= scalar;
1904 }
1905
1906 static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
1907         int length= a->length + b->length - 1;
1908         double *coeff= memalign(sizeof(double), length*sizeof(double));
1909         int i, j;
1910         SwsVector *vec= malloc(sizeof(SwsVector));
1911
1912         vec->coeff= coeff;
1913         vec->length= length;
1914
1915         for(i=0; i<length; i++) coeff[i]= 0.0;
1916
1917         for(i=0; i<a->length; i++)
1918         {
1919                 for(j=0; j<b->length; j++)
1920                 {
1921                         coeff[i+j]+= a->coeff[i]*b->coeff[j];
1922                 }
1923         }
1924
1925         return vec;
1926 }
1927
1928 static SwsVector *sumVec(SwsVector *a, SwsVector *b){
1929         int length= MAX(a->length, b->length);
1930         double *coeff= memalign(sizeof(double), length*sizeof(double));
1931         int i;
1932         SwsVector *vec= malloc(sizeof(SwsVector));
1933
1934         vec->coeff= coeff;
1935         vec->length= length;
1936
1937         for(i=0; i<length; i++) coeff[i]= 0.0;
1938
1939         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1940         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
1941
1942         return vec;
1943 }
1944
1945 static SwsVector *diffVec(SwsVector *a, SwsVector *b){
1946         int length= MAX(a->length, b->length);
1947         double *coeff= memalign(sizeof(double), length*sizeof(double));
1948         int i;
1949         SwsVector *vec= malloc(sizeof(SwsVector));
1950
1951         vec->coeff= coeff;
1952         vec->length= length;
1953
1954         for(i=0; i<length; i++) coeff[i]= 0.0;
1955
1956         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1957         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
1958
1959         return vec;
1960 }
1961
1962 /* shift left / or right if "shift" is negative */
1963 static SwsVector *getShiftedVec(SwsVector *a, int shift){
1964         int length= a->length + ABS(shift)*2;
1965         double *coeff= memalign(sizeof(double), length*sizeof(double));
1966         int i;
1967         SwsVector *vec= malloc(sizeof(SwsVector));
1968
1969         vec->coeff= coeff;
1970         vec->length= length;
1971
1972         for(i=0; i<length; i++) coeff[i]= 0.0;
1973
1974         for(i=0; i<a->length; i++)
1975         {
1976                 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1977         }
1978
1979         return vec;
1980 }
1981
1982 void shiftVec(SwsVector *a, int shift){
1983         SwsVector *shifted= getShiftedVec(a, shift);
1984         free(a->coeff);
1985         a->coeff= shifted->coeff;
1986         a->length= shifted->length;
1987         free(shifted);
1988 }
1989
1990 void addVec(SwsVector *a, SwsVector *b){
1991         SwsVector *sum= sumVec(a, b);
1992         free(a->coeff);
1993         a->coeff= sum->coeff;
1994         a->length= sum->length;
1995         free(sum);
1996 }
1997
1998 void subVec(SwsVector *a, SwsVector *b){
1999         SwsVector *diff= diffVec(a, b);
2000         free(a->coeff);
2001         a->coeff= diff->coeff;
2002         a->length= diff->length;
2003         free(diff);
2004 }
2005
2006 void convVec(SwsVector *a, SwsVector *b){
2007         SwsVector *conv= getConvVec(a, b);
2008         free(a->coeff);
2009         a->coeff= conv->coeff;
2010         a->length= conv->length;
2011         free(conv);
2012 }
2013
2014 SwsVector *cloneVec(SwsVector *a){
2015         double *coeff= memalign(sizeof(double), a->length*sizeof(double));
2016         int i;
2017         SwsVector *vec= malloc(sizeof(SwsVector));
2018
2019         vec->coeff= coeff;
2020         vec->length= a->length;
2021
2022         for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
2023
2024         return vec;
2025 }
2026
2027 void printVec(SwsVector *a){
2028         int i;
2029         double max=0;
2030         double min=0;
2031         double range;
2032
2033         for(i=0; i<a->length; i++)
2034                 if(a->coeff[i]>max) max= a->coeff[i];
2035
2036         for(i=0; i<a->length; i++)
2037                 if(a->coeff[i]<min) min= a->coeff[i];
2038
2039         range= max - min;
2040
2041         for(i=0; i<a->length; i++)
2042         {
2043                 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
2044                 printf("%1.3f ", a->coeff[i]);
2045                 for(;x>0; x--) printf(" ");
2046                 printf("|\n");
2047         }
2048 }
2049
2050 void freeVec(SwsVector *a){
2051         if(!a) return;
2052         if(a->coeff) free(a->coeff);
2053         a->coeff=NULL;
2054         a->length=0;
2055         free(a);
2056 }
2057
2058 void freeSwsContext(SwsContext *c){
2059         int i;
2060
2061         if(!c) return;
2062
2063         if(c->lumPixBuf)
2064         {
2065                 for(i=0; i<c->vLumBufSize; i++)
2066                 {
2067                         if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
2068                         c->lumPixBuf[i]=NULL;
2069                 }
2070                 free(c->lumPixBuf);
2071                 c->lumPixBuf=NULL;
2072         }
2073
2074         if(c->chrPixBuf)
2075         {
2076                 for(i=0; i<c->vChrBufSize; i++)
2077                 {
2078                         if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
2079                         c->chrPixBuf[i]=NULL;
2080                 }
2081                 free(c->chrPixBuf);
2082                 c->chrPixBuf=NULL;
2083         }
2084
2085         if(c->vLumFilter) free(c->vLumFilter);
2086         c->vLumFilter = NULL;
2087         if(c->vChrFilter) free(c->vChrFilter);
2088         c->vChrFilter = NULL;
2089         if(c->hLumFilter) free(c->hLumFilter);
2090         c->hLumFilter = NULL;
2091         if(c->hChrFilter) free(c->hChrFilter);
2092         c->hChrFilter = NULL;
2093
2094         if(c->vLumFilterPos) free(c->vLumFilterPos);
2095         c->vLumFilterPos = NULL;
2096         if(c->vChrFilterPos) free(c->vChrFilterPos);
2097         c->vChrFilterPos = NULL;
2098         if(c->hLumFilterPos) free(c->hLumFilterPos);
2099         c->hLumFilterPos = NULL;
2100         if(c->hChrFilterPos) free(c->hChrFilterPos);
2101         c->hChrFilterPos = NULL;
2102
2103         if(c->lumMmxFilter) free(c->lumMmxFilter);
2104         c->lumMmxFilter = NULL;
2105         if(c->chrMmxFilter) free(c->chrMmxFilter);
2106         c->chrMmxFilter = NULL;
2107
2108         if(c->lumMmx2Filter) free(c->lumMmx2Filter);
2109         c->lumMmx2Filter=NULL;
2110         if(c->chrMmx2Filter) free(c->chrMmx2Filter);
2111         c->chrMmx2Filter=NULL;
2112         if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
2113         c->lumMmx2FilterPos=NULL;
2114         if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
2115         c->chrMmx2FilterPos=NULL;
2116
2117         free(c);
2118 }
2119
2120