git.sesse.net Git - ffmpeg/blob - postproc/swscale.c

   1 /*
   2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17 */
  18
  19 /*
  20   supported Input formats: YV12, I420/IYUV, YUY2, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09
  21   supported output formats: YV12, I420/IYUV, BGR15, BGR16, BGR24, BGR32, Y8/Y800, YVU9/IF09
  22   BGR15/16 support dithering
  23
  24   unscaled special converters
  25   YV12/I420/IYUV -> BGR15/BGR16/BGR24/BGR32
  26   YV12/I420/IYUV -> YV12/I420/IYUV
  27   YUY2/BGR15/BGR16/BGR24/BGR32/RGB24/RGB32 -> same format
  28   BGR24 -> BGR32 & RGB24 -> RGB32
  29   BGR32 -> BGR24 & RGB32 -> RGB24
  30   BGR15 -> BGR16
  31 */
  32
  33 /*
  34 tested special converters
  35  YV12/I420 -> BGR16
  36  YV12 -> YV12
  37  BGR15 -> BGR16
  38  BGR16 -> BGR16
  39
  40 untested special converters
   41   YV12/I420 -> BGR15/BGR24/BGR32 (it's the yuv2rgb stuff, so it should be ok)
  42   YV12/I420 -> YV12/I420
  43   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  44   BGR24 -> BGR32 & RGB24 -> RGB32
  45   BGR32 -> BGR24 & RGB32 -> RGB24
  46   BGR24 -> YV12
  47 */
  48
  49 #include <inttypes.h>
  50 #include <string.h>
  51 #include <math.h>
  52 #include <stdio.h>
  53 #include "../config.h"
  54 #include "../mangle.h"
  55 #include <assert.h>
  56 #ifdef HAVE_MALLOC_H
  57 #include <malloc.h>
  58 #else
  59 #include <stdlib.h>
  60 #endif
  61 #include "swscale.h"
  62 #include "../cpudetect.h"
  63 #include "../bswap.h"
  64 #include "../libvo/img_format.h"
  65 #include "rgb2rgb.h"
  66 #include "../libvo/fastmemcpy.h"
  67 #include "../mp_msg.h"
  68
  69 #define MSG_WARN(args...) mp_msg(MSGT_SWS,MSGL_WARN, ##args )
  70 #define MSG_FATAL(args...) mp_msg(MSGT_SWS,MSGL_FATAL, ##args )
  71 #define MSG_ERR(args...) mp_msg(MSGT_SWS,MSGL_ERR, ##args )
  72 #define MSG_V(args...) mp_msg(MSGT_SWS,MSGL_V, ##args )
  73 #define MSG_DBG2(args...) mp_msg(MSGT_SWS,MSGL_DBG2, ##args )
  74 #define MSG_INFO(args...) mp_msg(MSGT_SWS,MSGL_INFO, ##args )
  75
  76 #undef MOVNTQ
  77 #undef PAVGB
  78
  79 //#undef HAVE_MMX2
  80 //#define HAVE_3DNOW
  81 //#undef HAVE_MMX
  82 //#undef ARCH_X86
  83 //#define WORDS_BIGENDIAN
  84 #define DITHER1XBPP
  85
  86 #define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
  87
  88 #define RET 0xC3 //near return opcode for X86
  89
  90 #ifdef MP_DEBUG
  91 #define ASSERT(x) assert(x);
  92 #else
  93 #define ASSERT(x) ;
  94 #endif
  95
  96 #ifdef M_PI
  97 #define PI M_PI
  98 #else
  99 #define PI 3.14159265358979323846
 100 #endif
 101
 102 //FIXME replace this with something faster
 103 #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YVU9)
 104 #define isYUV(x)       ((x)==IMGFMT_YUY2 || isPlanarYUV(x))
 105 #define isGray(x)      ((x)==IMGFMT_Y800)
 106 #define isSupportedIn(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_YUY2 \
 107                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
 108                         || (x)==IMGFMT_RGB32|| (x)==IMGFMT_RGB24\
 109                         || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
 110 #define isSupportedOut(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 \
 111                         || (x)==IMGFMT_BGR32|| (x)==IMGFMT_BGR24|| (x)==IMGFMT_BGR16|| (x)==IMGFMT_BGR15\
 112                         || (x)==IMGFMT_Y800 || (x)==IMGFMT_YVU9)
 113 #define isRGB(x)       (((x)&IMGFMT_RGB_MASK)==IMGFMT_RGB)
 114 #define isBGR(x)       (((x)&IMGFMT_BGR_MASK)==IMGFMT_BGR)
 115 #define isPacked(x)    ((x)==IMGFMT_YUY2 || isRGB(x) || isBGR(x))
 116
 117 #define RGB2YUV_SHIFT 16
 118 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
 119 #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
 120 #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
 121 #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
 122 #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
 123 #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
 124 #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
 125 #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
 126 #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
 127
 128 extern int verbose; // defined in mplayer.c
 129 /*
 130 NOTES
 131 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 132
 133 TODO
  134 more intelligent misalignment avoidance for the horizontal scaler
 135 write special vertical cubic upscale version
 136 Optimize C code (yv12 / minmax)
 137 add support for packed pixel yuv input & output
 138 add support for Y8 output
 139 optimize bgr24 & bgr32
 140 add BGR4 output support
 141 write special BGR->BGR scaler
 142 deglobalize yuv2rgb*.c
 143 */
 144
 145 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
 146 #define MIN(a,b) ((a) > (b) ? (b) : (a))
 147 #define MAX(a,b) ((a) < (b) ? (b) : (a))
 148
 149 #ifdef ARCH_X86
 150 #define CAN_COMPILE_X86_ASM
 151 #endif
 152
 153 #ifdef CAN_COMPILE_X86_ASM
 154 static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
 155 static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
 156 static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
 157 static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
 158 static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
 159 static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
 160 static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
 161 static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
 162 static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
 163 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
 164 static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
 165 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
 166 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
 167 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
 168 static uint64_t __attribute__((aligned(8))) bm01010101=0x00FF00FF00FF00FFLL;
 169
 170 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
 171 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
 172 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
 173 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
 174
 175 static uint64_t __attribute__((aligned(8))) dither4[2]={
 176         0x0103010301030103LL,
 177         0x0200020002000200LL,};
 178
 179 static uint64_t __attribute__((aligned(8))) dither8[2]={
 180         0x0602060206020602LL,
 181         0x0004000400040004LL,};
 182
 183 static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
 184 static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
 185 static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
 186 static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
 187 static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
 188 static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
 189
 190 static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
 191 static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
 192 static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
 193
 194 #ifdef FAST_BGR2YV12
 195 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000000210041000DULL;
 196 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL;
 197 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL;
 198 #else
 199 static const uint64_t bgr2YCoeff  __attribute__((aligned(8))) = 0x000020E540830C8BULL;
 200 static const uint64_t bgr2UCoeff  __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL;
 201 static const uint64_t bgr2VCoeff  __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL;
 202 #endif
 203 static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL;
 204 static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL;
 205 static const uint64_t w1111       __attribute__((aligned(8))) = 0x0001000100010001ULL;
 206
 207 // FIXME remove
 208 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
 209 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
 210 #endif
 211
 212 // clipping helper table for C implementations:
 213 static unsigned char clip_table[768];
 214
 215 static unsigned short clip_table16b[768];
 216 static unsigned short clip_table16g[768];
 217 static unsigned short clip_table16r[768];
 218 static unsigned short clip_table15b[768];
 219 static unsigned short clip_table15g[768];
 220 static unsigned short clip_table15r[768];
 221
 222 // yuv->rgb conversion tables:
 223 static    int yuvtab_2568[256];
 224 static    int yuvtab_3343[256];
 225 static    int yuvtab_0c92[256];
 226 static    int yuvtab_1a1e[256];
 227 static    int yuvtab_40cf[256];
 228 // Needed for cubic scaler to catch overflows
 229 static    int clip_yuvtab_2568[768];
 230 static    int clip_yuvtab_3343[768];
 231 static    int clip_yuvtab_0c92[768];
 232 static    int clip_yuvtab_1a1e[768];
 233 static    int clip_yuvtab_40cf[768];
 234
//global sws_flags from the command line
// Selects the scaler in getSwsContextFromCmdLine(): 0 fast bilinear, 1 bilinear,
// 2 bicubic, 3 "X", 4 point, 5 area; any other value falls back to bilinear.
int sws_flags=2;

//global srcFilter
// Rebuilt from the settings below on every getSwsContextFromCmdLine() call.
SwsFilter src_filter= {NULL, NULL, NULL, NULL};

// Gaussian blur settings for luma / chroma; 0.0 selects an identity vector instead.
float sws_lum_gblur= 0.0;
float sws_chr_gblur= 0.0;
// Chroma shifts, passed to shiftVec() when non-zero.
int sws_chr_vshift= 0;
int sws_chr_hshift= 0;
// Sharpen settings for chroma / luma; 0.0 disables the sharpen step.
float sws_chr_sharpen= 0.0;
float sws_lum_sharpen= 0.0;
 247
  248 /* cpuCaps combined from cpudetect and what's actually compiled in
  249    (if there is no support for something compiled in, it won't appear here) */
 250 static CpuCaps cpuCaps;
 251
 252 void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
 253              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
 254
 255 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
 256
 257 #ifdef CAN_COMPILE_X86_ASM
 258 void in_asm_used_var_warning_killer()
 259 {
 260  volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
 261  bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
 262  M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0]+bm01010101;
 263  if(i) i=0;
 264 }
 265 #endif
 266
// Image formats exercised by selfTest(), which uses each entry both as source
// and as destination format. The list is terminated by a 0 entry; the
// commented-out formats are currently not part of the test.
static int testFormat[]={
IMGFMT_YVU9,
IMGFMT_YV12,
//IMGFMT_IYUV,
IMGFMT_I420,
IMGFMT_BGR15,
IMGFMT_BGR16,
IMGFMT_BGR24,
IMGFMT_BGR32,
//IMGFMT_Y8,
IMGFMT_Y800,
//IMGFMT_YUY2,
0
};
 281
 282 static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
 283         int x,y;
 284         uint64_t ssd=0;
 285
 286         for(y=0; y<h; y++){
 287                 for(x=0; x<w; x++){
 288                         int d= src1[x + y*stride1] - src2[x + y*stride2];
 289                         ssd+= d*d;
 290                 }
 291         }
 292         return ssd;
 293 }
 294
 295 // test by ref -> src -> dst -> out & compare out against ref
 296 // ref & out are YV12
 297 static void doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat,
 298                    int srcW, int srcH, int dstW, int dstH, int flags){
 299         uint8_t *src[3];
 300         uint8_t *dst[3];
 301         uint8_t *out[3];
 302         int srcStride[3], dstStride[3];
 303         int i;
 304         uint64_t ssdY, ssdU, ssdV;
 305         SwsContext *srcContext, *dstContext, *outContext;
 306
 307         for(i=0; i<3; i++){
 308                 srcStride[i]= srcW*4;
 309                 dstStride[i]= dstW*4;
 310                 src[i]= malloc(srcStride[i]*srcH);
 311                 dst[i]= malloc(dstStride[i]*dstH);
 312                 out[i]= malloc(refStride[i]*h);
 313         }
 314
 315         srcContext= getSwsContext(w, h, IMGFMT_YV12, srcW, srcH, srcFormat, flags, NULL, NULL);
 316         dstContext= getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL);
 317         outContext= getSwsContext(dstW, dstH, dstFormat, w, h, IMGFMT_YV12, flags, NULL, NULL);
 318         if(srcContext==NULL ||dstContext==NULL ||outContext==NULL){
 319                 printf("Failed allocating swsContext\n");
 320                 goto end;
 321         }
 322 //      printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2],
 323 //              (int)src[0], (int)src[1], (int)src[2]);
 324
 325         srcContext->swScale(srcContext, ref, refStride, 0, h   , src, srcStride);
 326         dstContext->swScale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
 327         outContext->swScale(outContext, dst, dstStride, 0, dstH, out, refStride);
 328
 329         ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
 330         ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
 331         ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
 332
 333         if(isGray(srcFormat) || isGray(dstFormat)) ssdU=ssdV=0; //FIXME check that output is really gray
 334
 335         ssdY/= w*h;
 336         ssdU/= w*h/4;
 337         ssdV/= w*h/4;
 338
 339         if(ssdY>100 || ssdU>50 || ssdV>50){
 340                 printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n",
 341                         vo_format_name(srcFormat), srcW, srcH,
 342                         vo_format_name(dstFormat), dstW, dstH,
 343                         flags,
 344                         ssdY, ssdU, ssdV);
 345         }
 346
 347         end:
 348
 349         freeSwsContext(srcContext);
 350         freeSwsContext(dstContext);
 351         freeSwsContext(outContext);
 352
 353         for(i=0; i<3; i++){
 354                 free(src[i]);
 355                 free(dst[i]);
 356                 free(out[i]);
 357         }
 358 }
 359
 360 static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
 361         int srcFormat, dstFormat, srcFormatIndex, dstFormatIndex;
 362         int srcW, srcH, dstW, dstH;
 363         int flags;
 364
 365         for(srcFormatIndex=0; ;srcFormatIndex++){
 366                 srcFormat= testFormat[srcFormatIndex];
 367                 if(!srcFormat) break;
 368                 for(dstFormatIndex=0; ;dstFormatIndex++){
 369                         dstFormat= testFormat[dstFormatIndex];
 370                         if(!dstFormat) break;
 371                         if(!isSupportedOut(dstFormat)) continue;
 372
 373                         srcW= w+w/3;
 374                         srcH= h+h/3;
 375                         for(dstW=w; dstW<w*2; dstW+= dstW/3){
 376                                 for(dstH=h; dstH<h*2; dstH+= dstH/3){
 377                                         for(flags=1; flags<33; flags*=2)
 378                                                 doTest(src, stride, w, h, srcFormat, dstFormat,
 379                                                         srcW, srcH, dstW, dstH, flags);
 380                                 }
 381                         }
 382                 }
 383         }
 384 }
 385
 386 static inline void yuv2yuvXinC(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 387                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 388                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest)
 389 {
 390         //FIXME Optimize (just quickly writen not opti..)
 391         int i;
 392         for(i=0; i<c->dstW; i++)
 393         {
 394                 int val=0;
 395                 int j;
 396                 for(j=0; j<lumFilterSize; j++)
 397                         val += lumSrc[j][i] * lumFilter[j];
 398
 399                 dest[i]= MIN(MAX(val>>19, 0), 255);
 400         }
 401
 402         if(uDest != NULL)
 403                 for(i=0; i<c->chrDstW; i++)
 404                 {
 405                         int u=0;
 406                         int v=0;
 407                         int j;
 408                         for(j=0; j<chrFilterSize; j++)
 409                         {
 410                                 u += chrSrc[j][i] * chrFilter[j];
 411                                 v += chrSrc[j][i + 2048] * chrFilter[j];
 412                         }
 413
 414                         uDest[i]= MIN(MAX(u>>19, 0), 255);
 415                         vDest[i]= MIN(MAX(v>>19, 0), 255);
 416                 }
 417 }
 418
 419 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 420                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 421                                     uint8_t *dest, int dstW, int dstFormat)
 422 {
 423         if(dstFormat==IMGFMT_BGR32)
 424         {
 425                 int i;
 426 #ifdef WORDS_BIGENDIAN
 427         dest++;
 428 #endif
 429                 for(i=0; i<(dstW>>1); i++){
 430                         int j;
 431                         int Y1=0;
 432                         int Y2=0;
 433                         int U=0;
 434                         int V=0;
 435                         int Cb, Cr, Cg;
 436                         for(j=0; j<lumFilterSize; j++)
 437                         {
 438                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 439                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 440                         }
 441                         for(j=0; j<chrFilterSize; j++)
 442                         {
 443                                 U += chrSrc[j][i] * chrFilter[j];
 444                                 V += chrSrc[j][i+2048] * chrFilter[j];
 445                         }
 446                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 447                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 448                         U >>= 19;
 449                         V >>= 19;
 450
 451                         Cb= clip_yuvtab_40cf[U+ 256];
 452                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 453                         Cr= clip_yuvtab_3343[V+ 256];
 454
 455                         dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
 456                         dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
 457                         dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
 458
 459                         dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
 460                         dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
 461                         dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
 462                 }
 463         }
 464         else if(dstFormat==IMGFMT_BGR24)
 465         {
 466                 int i;
 467                 for(i=0; i<(dstW>>1); i++){
 468                         int j;
 469                         int Y1=0;
 470                         int Y2=0;
 471                         int U=0;
 472                         int V=0;
 473                         int Cb, Cr, Cg;
 474                         for(j=0; j<lumFilterSize; j++)
 475                         {
 476                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 477                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 478                         }
 479                         for(j=0; j<chrFilterSize; j++)
 480                         {
 481                                 U += chrSrc[j][i] * chrFilter[j];
 482                                 V += chrSrc[j][i+2048] * chrFilter[j];
 483                         }
 484                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 485                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 486                         U >>= 19;
 487                         V >>= 19;
 488
 489                         Cb= clip_yuvtab_40cf[U+ 256];
 490                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 491                         Cr= clip_yuvtab_3343[V+ 256];
 492
 493                         dest[0]=clip_table[((Y1 + Cb) >>13)];
 494                         dest[1]=clip_table[((Y1 + Cg) >>13)];
 495                         dest[2]=clip_table[((Y1 + Cr) >>13)];
 496
 497                         dest[3]=clip_table[((Y2 + Cb) >>13)];
 498                         dest[4]=clip_table[((Y2 + Cg) >>13)];
 499                         dest[5]=clip_table[((Y2 + Cr) >>13)];
 500                         dest+=6;
 501                 }
 502         }
 503         else if(dstFormat==IMGFMT_BGR16)
 504         {
 505                 int i;
 506 #ifdef DITHER1XBPP
 507                 static int ditherb1=1<<14;
 508                 static int ditherg1=1<<13;
 509                 static int ditherr1=2<<14;
 510                 static int ditherb2=3<<14;
 511                 static int ditherg2=3<<13;
 512                 static int ditherr2=0<<14;
 513
 514                 ditherb1 ^= (1^2)<<14;
 515                 ditherg1 ^= (1^2)<<13;
 516                 ditherr1 ^= (1^2)<<14;
 517                 ditherb2 ^= (3^0)<<14;
 518                 ditherg2 ^= (3^0)<<13;
 519                 ditherr2 ^= (3^0)<<14;
 520 #else
 521                 const int ditherb1=0;
 522                 const int ditherg1=0;
 523                 const int ditherr1=0;
 524                 const int ditherb2=0;
 525                 const int ditherg2=0;
 526                 const int ditherr2=0;
 527 #endif
 528                 for(i=0; i<(dstW>>1); i++){
 529                         int j;
 530                         int Y1=0;
 531                         int Y2=0;
 532                         int U=0;
 533                         int V=0;
 534                         int Cb, Cr, Cg;
 535                         for(j=0; j<lumFilterSize; j++)
 536                         {
 537                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 538                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 539                         }
 540                         for(j=0; j<chrFilterSize; j++)
 541                         {
 542                                 U += chrSrc[j][i] * chrFilter[j];
 543                                 V += chrSrc[j][i+2048] * chrFilter[j];
 544                         }
 545                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 546                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 547                         U >>= 19;
 548                         V >>= 19;
 549
 550                         Cb= clip_yuvtab_40cf[U+ 256];
 551                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 552                         Cr= clip_yuvtab_3343[V+ 256];
 553
 554                         ((uint16_t*)dest)[2*i] =
 555                                 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
 556                                 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
 557                                 clip_table16r[(Y1 + Cr + ditherr1) >>13];
 558
 559                         ((uint16_t*)dest)[2*i+1] =
 560                                 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
 561                                 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
 562                                 clip_table16r[(Y2 + Cr + ditherr2) >>13];
 563                 }
 564         }
 565         else if(dstFormat==IMGFMT_BGR15)
 566         {
 567                 int i;
 568 #ifdef DITHER1XBPP
 569                 static int ditherb1=1<<14;
 570                 static int ditherg1=1<<14;
 571                 static int ditherr1=2<<14;
 572                 static int ditherb2=3<<14;
 573                 static int ditherg2=3<<14;
 574                 static int ditherr2=0<<14;
 575
 576                 ditherb1 ^= (1^2)<<14;
 577                 ditherg1 ^= (1^2)<<14;
 578                 ditherr1 ^= (1^2)<<14;
 579                 ditherb2 ^= (3^0)<<14;
 580                 ditherg2 ^= (3^0)<<14;
 581                 ditherr2 ^= (3^0)<<14;
 582 #else
 583                 const int ditherb1=0;
 584                 const int ditherg1=0;
 585                 const int ditherr1=0;
 586                 const int ditherb2=0;
 587                 const int ditherg2=0;
 588                 const int ditherr2=0;
 589 #endif
 590                 for(i=0; i<(dstW>>1); i++){
 591                         int j;
 592                         int Y1=0;
 593                         int Y2=0;
 594                         int U=0;
 595                         int V=0;
 596                         int Cb, Cr, Cg;
 597                         for(j=0; j<lumFilterSize; j++)
 598                         {
 599                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 600                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 601                         }
 602                         for(j=0; j<chrFilterSize; j++)
 603                         {
 604                                 U += chrSrc[j][i] * chrFilter[j];
 605                                 V += chrSrc[j][i+2048] * chrFilter[j];
 606                         }
 607                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 608                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 609                         U >>= 19;
 610                         V >>= 19;
 611
 612                         Cb= clip_yuvtab_40cf[U+ 256];
 613                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 614                         Cr= clip_yuvtab_3343[V+ 256];
 615
 616                         ((uint16_t*)dest)[2*i] =
 617                                 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
 618                                 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
 619                                 clip_table15r[(Y1 + Cr + ditherr1) >>13];
 620
 621                         ((uint16_t*)dest)[2*i+1] =
 622                                 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
 623                                 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
 624                                 clip_table15r[(Y2 + Cr + ditherr2) >>13];
 625                 }
 626         }
 627 }
 628
 629
  630 //Note: we have C, X86, MMX, MMX2, 3DNOW versions; there is no 3DNOW+MMX2 one
 631 //Plain C versions
 632 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
 633 #define COMPILE_C
 634 #endif
 635
 636 #ifdef CAN_COMPILE_X86_ASM
 637
 638 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 639 #define COMPILE_MMX
 640 #endif
 641
 642 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
 643 #define COMPILE_MMX2
 644 #endif
 645
 646 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 647 #define COMPILE_3DNOW
 648 #endif
 649 #endif //CAN_COMPILE_X86_ASM
 650
 651 #undef HAVE_MMX
 652 #undef HAVE_MMX2
 653 #undef HAVE_3DNOW
 654
 655 #ifdef COMPILE_C
 656 #undef HAVE_MMX
 657 #undef HAVE_MMX2
 658 #undef HAVE_3DNOW
 659 #define RENAME(a) a ## _C
 660 #include "swscale_template.c"
 661 #endif
 662
 663 #ifdef CAN_COMPILE_X86_ASM
 664
 665 //X86 versions
 666 /*
 667 #undef RENAME
 668 #undef HAVE_MMX
 669 #undef HAVE_MMX2
 670 #undef HAVE_3DNOW
 671 #define ARCH_X86
 672 #define RENAME(a) a ## _X86
 673 #include "swscale_template.c"
 674 */
 675 //MMX versions
 676 #ifdef COMPILE_MMX
 677 #undef RENAME
 678 #define HAVE_MMX
 679 #undef HAVE_MMX2
 680 #undef HAVE_3DNOW
 681 #define RENAME(a) a ## _MMX
 682 #include "swscale_template.c"
 683 #endif
 684
 685 //MMX2 versions
 686 #ifdef COMPILE_MMX2
 687 #undef RENAME
 688 #define HAVE_MMX
 689 #define HAVE_MMX2
 690 #undef HAVE_3DNOW
 691 #define RENAME(a) a ## _MMX2
 692 #include "swscale_template.c"
 693 #endif
 694
 695 //3DNOW versions
 696 #ifdef COMPILE_3DNOW
 697 #undef RENAME
 698 #define HAVE_MMX
 699 #undef HAVE_MMX2
 700 #define HAVE_3DNOW
 701 #define RENAME(a) a ## _3DNow
 702 #include "swscale_template.c"
 703 #endif
 704
 705 #endif //CAN_COMPILE_X86_ASM
 706
  707 // minor note: the HAVE_xyz macros are messed up after this point, so don't use them
 708
 709
 710 // old global scaler, dont use for new code
 711 // will use sws_flags from the command line
 712 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
 713                              int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
 714                              int srcW, int srcH, int dstW, int dstH){
 715
 716         static SwsContext *context=NULL;
 717         int dstFormat;
 718         int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
 719
 720         switch(dstbpp)
 721         {
 722                 case 8 : dstFormat= IMGFMT_Y8;          break;
 723                 case 12: dstFormat= IMGFMT_YV12;        break;
 724                 case 15: dstFormat= IMGFMT_BGR15;       break;
 725                 case 16: dstFormat= IMGFMT_BGR16;       break;
 726                 case 24: dstFormat= IMGFMT_BGR24;       break;
 727                 case 32: dstFormat= IMGFMT_BGR32;       break;
 728                 default: return;
 729         }
 730
 731         if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
 732
 733         context->swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
 734 }
 735
 736 // will use sws_flags & src_filter (from cmd line)
 737 SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
 738 {
 739         int flags=0;
 740         static int firstTime=1;
 741
 742 #ifdef ARCH_X86
 743         if(gCpuCaps.hasMMX)
 744                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 745 #endif
 746         if(firstTime)
 747         {
 748                 firstTime=0;
 749                 flags= SWS_PRINT_INFO;
 750         }
 751         else if(verbose>1) flags= SWS_PRINT_INFO;
 752
 753         if(src_filter.lumH) freeVec(src_filter.lumH);
 754         if(src_filter.lumV) freeVec(src_filter.lumV);
 755         if(src_filter.chrH) freeVec(src_filter.chrH);
 756         if(src_filter.chrV) freeVec(src_filter.chrV);
 757
 758         if(sws_lum_gblur!=0.0){
 759                 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
 760                 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
 761         }else{
 762                 src_filter.lumH= getIdentityVec();
 763                 src_filter.lumV= getIdentityVec();
 764         }
 765
 766         if(sws_chr_gblur!=0.0){
 767                 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
 768                 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
 769         }else{
 770                 src_filter.chrH= getIdentityVec();
 771                 src_filter.chrV= getIdentityVec();
 772         }
 773
 774         if(sws_chr_sharpen!=0.0){
 775                 SwsVector *g= getConstVec(-1.0, 3);
 776                 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
 777                 g->coeff[1]=2.0;
 778                 addVec(id, g);
 779                 convVec(src_filter.chrH, id);
 780                 convVec(src_filter.chrV, id);
 781                 freeVec(g);
 782                 freeVec(id);
 783         }
 784
 785         if(sws_lum_sharpen!=0.0){
 786                 SwsVector *g= getConstVec(-1.0, 3);
 787                 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
 788                 g->coeff[1]=2.0;
 789                 addVec(id, g);
 790                 convVec(src_filter.lumH, id);
 791                 convVec(src_filter.lumV, id);
 792                 freeVec(g);
 793                 freeVec(id);
 794         }
 795
 796         if(sws_chr_hshift)
 797                 shiftVec(src_filter.chrH, sws_chr_hshift);
 798
 799         if(sws_chr_vshift)
 800                 shiftVec(src_filter.chrV, sws_chr_vshift);
 801
 802         normalizeVec(src_filter.chrH, 1.0);
 803         normalizeVec(src_filter.chrV, 1.0);
 804         normalizeVec(src_filter.lumH, 1.0);
 805         normalizeVec(src_filter.lumV, 1.0);
 806
 807         if(verbose > 1) printVec(src_filter.chrH);
 808         if(verbose > 1) printVec(src_filter.lumH);
 809
 810         switch(sws_flags)
 811         {
 812                 case 0: flags|= SWS_FAST_BILINEAR; break;
 813                 case 1: flags|= SWS_BILINEAR; break;
 814                 case 2: flags|= SWS_BICUBIC; break;
 815                 case 3: flags|= SWS_X; break;
 816                 case 4: flags|= SWS_POINT; break;
 817                 case 5: flags|= SWS_AREA; break;
 818                 default:flags|= SWS_BILINEAR; break;
 819         }
 820
 821         return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
 822 }
 823
 824
  825 static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
  826                               int srcW, int dstW, int filterAlign, int one, int flags,
  827                               SwsVector *srcFilter, SwsVector *dstFilter)
  828 {
              /*
               * Builds the per-output-sample filter table for one scaling axis.
               *
               * For each of the dstW output samples this computes a start
               * position in the source, stored in (*filterPos)[i], and
               * *outFilterSize int16_t coefficients, stored row by row in
               * *outFilter.  Both arrays are allocated here with memalign()
               * (with one extra entry/row beyond dstW) and are not freed by
               * this function.
               *
               *  xInc         source step per output sample in 16.16 fixed
               *               point; values within 10 of 0x10000 are handled
               *               as unscaled
               *  srcW, dstW   source / destination length along this axis
               *  filterAlign  the final filter size is rounded up to a
               *               multiple of this; the rounding uses a bit mask,
               *               so this presumably has to be a power of two
               *  one          value the coefficients of one output sample are
               *               scaled to sum to (before truncation to int)
               *  flags        SWS_* flags: select the kernel, SWS_PRINT_INFO
               *               enables the size message
               *  srcFilter    optional vector convolved into every row
               *  dstFilter    only enlarges the intermediate row size, it is
               *               not applied (see the FIXME further down)
               */
  829         int i;
  830         int filterSize;
  831         int filter2Size;
  832         int minFilterSize;
              /* filter: raw kernel rows, later reused for the reduced rows;
                 filter2: rows after convolution with srcFilter */
  833         double *filter=NULL;
  834         double *filter2=NULL;
  835 #ifdef ARCH_X86
  836         if(gCpuCaps.hasMMX)
  837                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
  838 #endif
  839
  840         // Note the +1 is for the MMXscaler which reads over the end
  841         *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
  842
  843         if(ABS(xInc - 0x10000) <10) // unscaled
  844         {
  845                 int i;
  846                 filterSize= 1;
  847                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  848                 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
  849
                      /* identity: one tap of weight 1 at the same position */
  850                 for(i=0; i<dstW; i++)
  851                 {
  852                         filter[i*filterSize]=1;
  853                         (*filterPos)[i]=i;
  854                 }
  855
  856         }
  857         else if(flags&SWS_POINT) // lame looking point sampling mode
  858         {
  859                 int i;
  860                 int xDstInSrc;
  861                 filterSize= 1;
  862                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  863
                      /* position of the current output sample in source
                         coordinates (16.16), advanced by xInc per sample */
  864                 xDstInSrc= xInc/2 - 0x8000;
  865                 for(i=0; i<dstW; i++)
  866                 {
  867                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
  868
  869                         (*filterPos)[i]= xx;
  870                         filter[i]= 1.0;
  871                         xDstInSrc+= xInc;
  872                 }
  873         }
  874         else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
  875         {
  876                 int i;
  877                 int xDstInSrc;
  878                 if     (flags&SWS_BICUBIC) filterSize= 4;
  879                 else if(flags&SWS_X      ) filterSize= 4;
  880                 else                       filterSize= 2; // SWS_BILINEAR / SWS_AREA
  881                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  882
  883                 xDstInSrc= xInc/2 - 0x8000;
  884                 for(i=0; i<dstW; i++)
  885                 {
                              /* xx: first source sample covered by the
                                 filterSize taps of this output sample */
  886                         int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
  887                         int j;
  888
  889                         (*filterPos)[i]= xx;
  890                         if((flags & SWS_BICUBIC) || (flags & SWS_X))
  891                         {
                                      /* d: distance from the second tap (xx+1)
                                         to the sample position, in source pixels */
  892                                 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
  893                                 double y1,y2,y3,y4;
  894                                 double A= -0.6;
  895                                 if(flags & SWS_BICUBIC){
  896                                                 // Equation is from VirtualDub
  897                                         y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
  898                                         y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
  899                                         y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
  900                                         y4 = (                  +           A*d*d -       A*d*d*d);
  901                                 }else{
  902                                                 // cubic interpolation (derived it myself)
  903                                         y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
  904                                         y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
  905                                         y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
  906                                         y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
  907                                 }
  908
  909                                 filter[i*filterSize + 0]= y1;
  910                                 filter[i*filterSize + 1]= y2;
  911                                 filter[i*filterSize + 2]= y3;
  912                                 filter[i*filterSize + 3]= y4;
  913                         }
  914                         else
  915                         {
  916                                 //Bilinear upscale / linear interpolate / Area averaging
                                      /* xx is advanced here; filterPos was
                                         already stored above */
  917                                 for(j=0; j<filterSize; j++)
  918                                 {
  919                                         double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
  920                                         double coeff= 1.0 - d;
  921                                         if(coeff<0) coeff=0;
  922                                         filter[i*filterSize + j]= coeff;
  923                                         xx++;
  924                                 }
  925                         }
  926                         xDstInSrc+= xInc;
  927                 }
  928         }
  929         else // downscale
  930         {
  931                 int xDstInSrc;
  932                 ASSERT(dstW <= srcW)
  933
                      /* the number of taps grows with the scale factor srcW/dstW */
  934                 if(flags&SWS_BICUBIC)   filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
  935                 else if(flags&SWS_X)    filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
  936                 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
  937                 else /* BILINEAR */     filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
  938                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
  939
  940                 xDstInSrc= xInc/2 - 0x8000;
  941                 for(i=0; i<dstW; i++)
  942                 {
  943                         int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
  944                         int j;
  945                         (*filterPos)[i]= xx;
  946                         for(j=0; j<filterSize; j++)
  947                         {
                                      /* distance of tap xx from the sample
                                         position, measured in units of xInc */
  948                                 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
  949                                 double coeff;
  950                                 if((flags & SWS_BICUBIC) || (flags & SWS_X))
  951                                 {
  952                                         double A= -0.75;
  953 //                                      d*=2;
  954                                         // Equation is from VirtualDub
  955                                         if(d<1.0)
  956                                                 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
  957                                         else if(d<2.0)
  958                                                 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
  959                                         else
  960                                                 coeff=0.0;
  961                                 }
  962                                 else if(flags & SWS_AREA)
  963                                 {
  964                                         double srcPixelSize= (1<<16)/(double)xInc;
  965                                         if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
  966                                         else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
  967                                         else coeff=0.0;
  968                                 }
  969                                 else
  970                                 {
  971                                         coeff= 1.0 - d;
  972                                         if(coeff<0) coeff=0;
  973                                 }
  974                                 filter[i*filterSize + j]= coeff;
  975                                 xx++;
  976                         }
  977                         xDstInSrc+= xInc;
  978                 }
  979         }
  980
  981         /* apply src & dst Filter to filter -> filter2
  982            free(filter);
  983         */
  984         ASSERT(filterSize>0)
              /* a convolution with a vector of length n widens a row by n-1 */
  985         filter2Size= filterSize;
  986         if(srcFilter) filter2Size+= srcFilter->length - 1;
  987         if(dstFilter) filter2Size+= dstFilter->length - 1;
  988         ASSERT(filter2Size>0)
  989         filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
  990
  991         for(i=0; i<dstW; i++)
  992         {
  993                 int j;
                      /* scaleFilter only points into filter, it owns nothing */
  994                 SwsVector scaleFilter;
  995                 SwsVector *outVec;
  996
  997                 scaleFilter.coeff= filter + i*filterSize;
  998                 scaleFilter.length= filterSize;
  999
 1000                 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
 1001                 else          outVec= &scaleFilter;
 1002
 1003                 ASSERT(outVec->length == filter2Size)
 1004                 //FIXME dstFilter
 1005
 1006                 for(j=0; j<outVec->length; j++)
 1007                 {
 1008                         filter2[i*filter2Size + j]= outVec->coeff[j];
 1009                 }
 1010
                      /* the row got wider: move its start by the difference
                         of the old and new half widths */
 1011                 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
 1012
                      /* free only what getConvVec() returned */
 1013                 if(outVec != &scaleFilter) freeVec(outVec);
 1014         }
 1015         free(filter); filter=NULL;
 1016
 1017         /* try to reduce the filter-size (step1 find size and shift left) */
 1018         // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
 1019         minFilterSize= 0;
 1020         for(i=dstW-1; i>=0; i--)
 1021         {
 1022                 int min= filter2Size;
 1023                 int j;
 1024                 double cutOff=0.0;
 1025
 1026                 /* get rid off near zero elements on the left by shifting left */
 1027                 for(j=0; j<filter2Size; j++)
 1028                 {
 1029                         int k;
                              /* always element 0 of the row: the row is shifted
                                 left on every pass, so the next candidate is
                                 at index 0 again */
 1030                         cutOff += ABS(filter2[i*filter2Size]);
 1031
 1032                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
 1033
 1034                         /* preserve Monotonicity because the core cant handle the filter otherwise */
 1035                         if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
 1036
 1037                         // Move filter coeffs left
 1038                         for(k=1; k<filter2Size; k++)
 1039                                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
 1040                         filter2[i*filter2Size + k - 1]= 0.0;
 1041                         (*filterPos)[i]++;
 1042                 }
 1043
 1044                 cutOff=0.0;
 1045                 /* count near zeros on the right */
 1046                 for(j=filter2Size-1; j>0; j--)
 1047                 {
 1048                         cutOff += ABS(filter2[i*filter2Size + j]);
 1049
 1050                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
 1051                         min--;
 1052                 }
 1053
                      /* the widest row decides the common filter size */
 1054                 if(min>minFilterSize) minFilterSize= min;
 1055         }
 1056
 1057         ASSERT(minFilterSize > 0)
              /* round up to a multiple of filterAlign */
 1058         filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
 1059         ASSERT(filterSize > 0)
 1060         filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
 1061         *outFilterSize= filterSize;
 1062
 1063         if(flags&SWS_PRINT_INFO)
 1064                 MSG_INFO("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
 1065         /* try to reduce the filter-size (step2 reduce it) */
              /* keep the first filterSize taps of each row; pad with zeros
                 where the aligned size is larger than filter2Size */
 1066         for(i=0; i<dstW; i++)
 1067         {
 1068                 int j;
 1069
 1070                 for(j=0; j<filterSize; j++)
 1071                 {
 1072                         if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
 1073                         else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
 1074                 }
 1075         }
 1076         free(filter2); filter2=NULL;
 1077
 1078
 1079         //FIXME try to align filterpos if possible
 1080
 1081         //fix borders
              /* rows that start before 0 or end after srcW are moved inside
                 the source; the weight of the taps that fell outside is
                 added to the nearest tap that stays inside */
 1082         for(i=0; i<dstW; i++)
 1083         {
 1084                 int j;
 1085                 if((*filterPos)[i] < 0)
 1086                 {
 1087                         // Move filter coeffs left to compensate for filterPos
 1088                         for(j=1; j<filterSize; j++)
 1089                         {
 1090                                 int left= MAX(j + (*filterPos)[i], 0);
 1091                                 filter[i*filterSize + left] += filter[i*filterSize + j];
 1092                                 filter[i*filterSize + j]=0;
 1093                         }
 1094                         (*filterPos)[i]= 0;
 1095                 }
 1096
 1097                 if((*filterPos)[i] + filterSize > srcW)
 1098                 {
 1099                         int shift= (*filterPos)[i] + filterSize - srcW;
 1100                         // Move filter coeffs right to compensate for filterPos
 1101                         for(j=filterSize-2; j>=0; j--)
 1102                         {
 1103                                 int right= MIN(j + shift, filterSize-1);
 1104                                 filter[i*filterSize +right] += filter[i*filterSize +j];
 1105                                 filter[i*filterSize +j]=0;
 1106                         }
 1107                         (*filterPos)[i]= srcW - filterSize;
 1108                 }
 1109         }
 1110
 1111         // Note the +1 is for the MMXscaler which reads over the end
 1112         *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
 1113         memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
 1114
 1115         /* Normalize & Store in outFilter */
 1116         for(i=0; i<dstW; i++)
 1117         {
 1118                 int j;
 1119                 double sum=0;
 1120                 double scale= one;
 1121                 for(j=0; j<filterSize; j++)
 1122                 {
 1123                         sum+= filter[i*filterSize + j];
 1124                 }
                      /* NOTE(review): a row whose coefficients sum to 0 would
                         divide by zero here; there is no guard */
 1125                 scale/= sum;
 1126                 for(j=0; j<filterSize; j++)
 1127                 {
 1128                         (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
 1129                 }
 1130         }
 1131
              /* fill the extra entry / row at index dstW with a copy of the
                 last real one */
 1132         (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
 1133         for(i=0; i<*outFilterSize; i++)
 1134         {
 1135                 int j= dstW*(*outFilterSize);
 1136                 (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
 1137         }
 1138
 1139         free(filter);
 1140 }
1141
1142 #ifdef ARCH_X86
 1143 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *filter, int32_t *filterPos, int numSplits)
 1144 {
              /*
               * Generates the code of a horizontal scaler into funnyCode.
               *
               * Two template fragments (A and B) are assembled by the asm
               * blocks below.  They are not executed there: each block jumps
               * over its fragment and only stores the fragment address, the
               * offsets of the immediate bytes of its two pshufw
               * instructions and the fragment length.  The loop further down
               * copies one fragment per group of 4 output samples into
               * funnyCode and patches the two immediates for that group.
               *
               *  dstW, numSplits  dstW/numSplits output samples are handled
               *  xInc             source step per output sample, 16.16 fixed
               *                   point
               *  funnyCode        receives the generated code, ended by RET
               *  filter           receives one 7 bit coefficient per sample
               *  filterPos        receives the source position per group at
               *                   index i/2, plus one final entry
               */
 1145         uint8_t *fragmentA;
 1146         int imm8OfPShufW1A;
 1147         int imm8OfPShufW2A;
 1148         int fragmentLengthA;
 1149         uint8_t *fragmentB;
 1150         int imm8OfPShufW1B;
 1151         int imm8OfPShufW2B;
 1152         int fragmentLengthB;
 1153         int fragmentPos;
 1154
 1155         int xpos, i;
 1156
 1157         // create an optimized horizontal scaling routine
 1158
 1159         //code fragment
 1160
              /* Fragment A: loads the source twice, at (%ecx,%esi) and at
                 1(%ecx,%esi).  After label 9 the outputs are computed:
                 %0 = address of label 0 (fragment start),
                 %1 / %2 = offset of the byte just before label 1 / 2, which
                 is the last byte of the preceding pshufw (its immediate),
                 %3 = fragment length (label 9 - label 0). */
 1161         asm volatile(
 1162                 "jmp 9f                         \n\t"
 1163         // Begin
 1164                 "0:                             \n\t"
 1165                 "movq (%%edx, %%eax), %%mm3     \n\t"
 1166                 "movd (%%ecx, %%esi), %%mm0     \n\t"
 1167                 "movd 1(%%ecx, %%esi), %%mm1    \n\t"
 1168                 "punpcklbw %%mm7, %%mm1         \n\t"
 1169                 "punpcklbw %%mm7, %%mm0         \n\t"
 1170                 "pshufw $0xFF, %%mm1, %%mm1     \n\t"
 1171                 "1:                             \n\t"
 1172                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
 1173                 "2:                             \n\t"
 1174                 "psubw %%mm1, %%mm0             \n\t"
 1175                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
 1176                 "pmullw %%mm3, %%mm0            \n\t"
 1177                 "psllw $7, %%mm1                \n\t"
 1178                 "paddw %%mm1, %%mm0             \n\t"
 1179
 1180                 "movq %%mm0, (%%edi, %%eax)     \n\t"
 1181
 1182                 "addl $8, %%eax                 \n\t"
 1183         // End
 1184                 "9:                             \n\t"
 1185 //              "int $3\n\t"
 1186                 "leal 0b, %0                    \n\t"
 1187                 "leal 1b, %1                    \n\t"
 1188                 "leal 2b, %2                    \n\t"
 1189                 "decl %1                        \n\t"
 1190                 "decl %2                        \n\t"
 1191                 "subl %0, %1                    \n\t"
 1192                 "subl %0, %2                    \n\t"
 1193                 "leal 9b, %3                    \n\t"
 1194                 "subl %0, %3                    \n\t"
 1195
 1196
 1197                 :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
 1198                 "=r" (fragmentLengthA)
 1199         );
 1200
              /* Fragment B: same as A, but with a single load from
                 (%ecx,%esi); both pshufw instructions shuffle that one
                 register.  The outputs are computed exactly as for A. */
 1201         asm volatile(
 1202                 "jmp 9f                         \n\t"
 1203         // Begin
 1204                 "0:                             \n\t"
 1205                 "movq (%%edx, %%eax), %%mm3     \n\t"
 1206                 "movd (%%ecx, %%esi), %%mm0     \n\t"
 1207                 "punpcklbw %%mm7, %%mm0         \n\t"
 1208                 "pshufw $0xFF, %%mm0, %%mm1     \n\t"
 1209                 "1:                             \n\t"
 1210                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
 1211                 "2:                             \n\t"
 1212                 "psubw %%mm1, %%mm0             \n\t"
 1213                 "movl 8(%%ebx, %%eax), %%esi    \n\t"
 1214                 "pmullw %%mm3, %%mm0            \n\t"
 1215                 "psllw $7, %%mm1                \n\t"
 1216                 "paddw %%mm1, %%mm0             \n\t"
 1217
 1218                 "movq %%mm0, (%%edi, %%eax)     \n\t"
 1219
 1220                 "addl $8, %%eax                 \n\t"
 1221         // End
 1222                 "9:                             \n\t"
 1223 //              "int $3\n\t"
 1224                 "leal 0b, %0                    \n\t"
 1225                 "leal 1b, %1                    \n\t"
 1226                 "leal 2b, %2                    \n\t"
 1227                 "decl %1                        \n\t"
 1228                 "decl %2                        \n\t"
 1229                 "subl %0, %1                    \n\t"
 1230                 "subl %0, %2                    \n\t"
 1231                 "leal 9b, %3                    \n\t"
 1232                 "subl %0, %3                    \n\t"
 1233
 1234
 1235                 :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
 1236                 "=r" (fragmentLengthB)
 1237         );
 1238
 1239         xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
 1240         fragmentPos=0;
 1241
 1242         for(i=0; i<dstW/numSplits; i++)
 1243         {
                      /* integer part of the 16.16 source position */
 1244                 int xx=xpos>>16;
 1245
                      /* one fragment is emitted per group of 4 output samples */
 1246                 if((i&3) == 0)
 1247                 {
                              /* source offsets of the 4 samples of this group,
                                 relative to xx */
 1248                         int a=0;
 1249                         int b=((xpos+xInc)>>16) - xx;
 1250                         int c=((xpos+xInc*2)>>16) - xx;
 1251                         int d=((xpos+xInc*3)>>16) - xx;
 1252
                              /* inverted 16 bit fraction of the position,
                                 reduced to 7 bits */
 1253                         filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
 1254                         filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
 1255                         filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
 1256                         filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
 1257                         filterPos[i/2]= xx;
 1258
                              /* d+1 < 4: the largest offset used (d+1) still
                                 fits into a 2 bit selector, so the single-load
                                 fragment B is used */
 1259                         if(d+1<4)
 1260                         {
 1261                                 int maxShift= 3-(d+1);
 1262                                 int shift=0;
 1263
 1264                                 memcpy(funnyCode + fragmentPos, fragmentB, fragmentLengthB);
 1265
                                      /* each immediate packs four 2 bit
                                         selectors; the first one selects the
                                         samples one position further right */
 1266                                 funnyCode[fragmentPos + imm8OfPShufW1B]=
 1267                                         (a+1) | ((b+1)<<2) | ((c+1)<<4) | ((d+1)<<6);
 1268                                 funnyCode[fragmentPos + imm8OfPShufW2B]=
 1269                                         a | (b<<2) | (c<<4) | (d<<6);
 1270
 1271                                 if(i+3>=dstW) shift=maxShift; //avoid overread
 1272                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
 1273
                                      /* 0x55*shift adds shift to each of the
                                         four 2 bit selectors; the start
                                         position is moved back by the same
                                         amount */
 1274                                 if(shift && i>=shift)
 1275                                 {
 1276                                         funnyCode[fragmentPos + imm8OfPShufW1B]+= 0x55*shift;
 1277                                         funnyCode[fragmentPos + imm8OfPShufW2B]+= 0x55*shift;
 1278                                         filterPos[i/2]-=shift;
 1279                                 }
 1280
 1281                                 fragmentPos+= fragmentLengthB;
 1282                         }
 1283                         else
 1284                         {
 1285                                 int maxShift= 3-d;
 1286                                 int shift=0;
 1287
 1288                                 memcpy(funnyCode + fragmentPos, fragmentA, fragmentLengthA);
 1289
                                      /* fragment A does the second load one
                                         byte further, so both immediates get
                                         the same selectors */
 1290                                 funnyCode[fragmentPos + imm8OfPShufW1A]=
 1291                                 funnyCode[fragmentPos + imm8OfPShufW2A]=
 1292                                         a | (b<<2) | (c<<4) | (d<<6);
 1293
 1294                                 if(i+4>=dstW) shift=maxShift; //avoid overread
 1295                                 else if((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //partial align
 1296
 1297                                 if(shift && i>=shift)
 1298                                 {
 1299                                         funnyCode[fragmentPos + imm8OfPShufW1A]+= 0x55*shift;
 1300                                         funnyCode[fragmentPos + imm8OfPShufW2A]+= 0x55*shift;
 1301                                         filterPos[i/2]-=shift;
 1302                                 }
 1303
 1304                                 fragmentPos+= fragmentLengthA;
 1305                         }
 1306
                              /* end marker after the last fragment so far; the
                                 next fragment, if any, is copied over it */
 1307                         funnyCode[fragmentPos]= RET;
 1308                 }
 1309                 xpos+=xInc;
 1310         }
 1311         filterPos[i/2]= xpos>>16; // needed to jump to the next part
 1312 }
1313 #endif // ARCH_X86
1314
//FIXME remove
/* Empty stub: performs no work. */
void SwScale_Init()
{
}
1318
1319 static void globalInit(){
1320     // generating tables:
1321     int i;
1322     for(i=0; i<768; i++){
1323         int c= MIN(MAX(i-256, 0), 255);
1324         clip_table[i]=c;
1325         yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1326         yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1327         yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1328         yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1329         yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1330     }
1331
1332     for(i=0; i<768; i++)
1333     {
1334         int v= clip_table[i];
1335         clip_table16b[i]=  v>>3;
1336         clip_table16g[i]= (v<<3)&0x07E0;
1337         clip_table16r[i]= (v<<8)&0xF800;
1338         clip_table15b[i]=  v>>3;
1339         clip_table15g[i]= (v<<2)&0x03E0;
1340         clip_table15r[i]= (v<<7)&0x7C00;
1341     }
1342
1343 cpuCaps= gCpuCaps;
1344
1345 #ifdef RUNTIME_CPUDETECT
1346 #ifdef CAN_COMPILE_X86_ASM
1347         // ordered per speed fasterst first
1348         if(gCpuCaps.hasMMX2)
1349                 swScale= swScale_MMX2;
1350         else if(gCpuCaps.has3DNow)
1351                 swScale= swScale_3DNow;
1352         else if(gCpuCaps.hasMMX)
1353                 swScale= swScale_MMX;
1354         else
1355                 swScale= swScale_C;
1356
1357 #else
1358         swScale= swScale_C;
1359         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1360 #endif
1361 #else //RUNTIME_CPUDETECT
1362 #ifdef HAVE_MMX2
1363         swScale= swScale_MMX2;
1364         cpuCaps.has3DNow = 0;
1365 #elif defined (HAVE_3DNOW)
1366         swScale= swScale_3DNow;
1367         cpuCaps.hasMMX2 = 0;
1368 #elif defined (HAVE_MMX)
1369         swScale= swScale_MMX;
1370         cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1371 #else
1372         swScale= swScale_C;
1373         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1374 #endif
1375 #endif //!RUNTIME_CPUDETECT
1376 }
1377
1378 static void PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1379              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1380         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1381         /* Copy Y plane */
1382         if(dstStride[0]==srcStride[0])
1383                 memcpy(dst, src[0], srcSliceH*dstStride[0]);
1384         else
1385         {
1386                 int i;
1387                 uint8_t *srcPtr= src[0];
1388                 uint8_t *dstPtr= dst;
1389                 for(i=0; i<srcSliceH; i++)
1390                 {
1391                         memcpy(dstPtr, srcPtr, srcStride[0]);
1392                         srcPtr+= srcStride[0];
1393                         dstPtr+= dstStride[0];
1394                 }
1395         }
1396         dst = dstParam[1] + dstStride[1]*srcSliceY;
1397         if(c->srcFormat==IMGFMT_YV12)
1398                 interleaveBytes( src[1],src[2],dst,c->srcW,srcSliceH,srcStride[1],srcStride[2],dstStride[0] );
1399         else /* I420 & IYUV */
1400                 interleaveBytes( src[2],src[1],dst,c->srcW,srcSliceH,srcStride[2],srcStride[1],dstStride[0] );
1401 }
1402
1403
1404 /* Warper functions for yuv2bgr */
1405 static void planarYuvToBgr(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1406              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1407         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1408
1409         if(c->srcFormat==IMGFMT_YV12)
1410                 yuv2rgb( dst,src[0],src[1],src[2],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1411         else /* I420 & IYUV */
1412                 yuv2rgb( dst,src[0],src[2],src[1],c->srcW,srcSliceH,dstStride[0],srcStride[0],srcStride[1] );
1413 }
1414
1415 static void PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1416              int srcSliceH, uint8_t* dstParam[], int dstStride[]){
1417         uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1418
1419         if(c->srcFormat==IMGFMT_YV12)
1420                 yv12toyuy2( src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1421         else /* I420 & IYUV */
1422                 yv12toyuy2( src[0],src[2],src[1],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0] );
1423 }
1424
1425 static void bgr24to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1426              int srcSliceH, uint8_t* dst[], int dstStride[]){
1427
1428         if(dstStride[0]*3==srcStride[0]*4)
1429                 rgb24to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1430         else
1431         {
1432                 int i;
1433                 uint8_t *srcPtr= src[0];
1434                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1435
1436                 for(i=0; i<srcSliceH; i++)
1437                 {
1438                         rgb24to32(srcPtr, dstPtr, c->srcW*3);
1439                         srcPtr+= srcStride[0];
1440                         dstPtr+= dstStride[0];
1441                 }
1442         }
1443 }
1444
1445 static void bgr24to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1446              int srcSliceH, uint8_t* dst[], int dstStride[]){
1447
1448         if(dstStride[0]*3==srcStride[0]*2)
1449                 rgb24to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1450         else
1451         {
1452                 int i;
1453                 uint8_t *srcPtr= src[0];
1454                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1455
1456                 for(i=0; i<srcSliceH; i++)
1457                 {
1458                         rgb24to16(srcPtr, dstPtr, c->srcW*3);
1459                         srcPtr+= srcStride[0];
1460                         dstPtr+= dstStride[0];
1461                 }
1462         }
1463 }
1464
1465 static void bgr24to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1466              int srcSliceH, uint8_t* dst[], int dstStride[]){
1467
1468         if(dstStride[0]*3==srcStride[0]*2)
1469                 rgb24to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1470         else
1471         {
1472                 int i;
1473                 uint8_t *srcPtr= src[0];
1474                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1475
1476                 for(i=0; i<srcSliceH; i++)
1477                 {
1478                         rgb24to15(srcPtr, dstPtr, c->srcW*3);
1479                         srcPtr+= srcStride[0];
1480                         dstPtr+= dstStride[0];
1481                 }
1482         }
1483 }
1484
1485 static void bgr32to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1486              int srcSliceH, uint8_t* dst[], int dstStride[]){
1487
1488         if(dstStride[0]*4==srcStride[0]*3)
1489                 rgb32to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1490         else
1491         {
1492                 int i;
1493                 uint8_t *srcPtr= src[0];
1494                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1495
1496                 for(i=0; i<srcSliceH; i++)
1497                 {
1498                         rgb32to24(srcPtr, dstPtr, c->srcW<<2);
1499                         srcPtr+= srcStride[0];
1500                         dstPtr+= dstStride[0];
1501                 }
1502         }
1503 }
1504
1505 static void bgr32to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1506              int srcSliceH, uint8_t* dst[], int dstStride[]){
1507
1508         if(dstStride[0]*4==srcStride[0]*2)
1509                 rgb32to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1510         else
1511         {
1512                 int i;
1513                 uint8_t *srcPtr= src[0];
1514                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1515
1516                 for(i=0; i<srcSliceH; i++)
1517                 {
1518                         rgb32to16(srcPtr, dstPtr, c->srcW<<2);
1519                         srcPtr+= srcStride[0];
1520                         dstPtr+= dstStride[0];
1521                 }
1522         }
1523 }
1524
1525 static void bgr32to15Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1526              int srcSliceH, uint8_t* dst[], int dstStride[]){
1527
1528         if(dstStride[0]*4==srcStride[0]*2)
1529                 rgb32to15(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1530         else
1531         {
1532                 int i;
1533                 uint8_t *srcPtr= src[0];
1534                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1535
1536                 for(i=0; i<srcSliceH; i++)
1537                 {
1538                         rgb32to15(srcPtr, dstPtr, c->srcW<<2);
1539                         srcPtr+= srcStride[0];
1540                         dstPtr+= dstStride[0];
1541                 }
1542         }
1543 }
1544
1545 static void bgr15to16Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1546              int srcSliceH, uint8_t* dst[], int dstStride[]){
1547
1548         if(dstStride[0]==srcStride[0])
1549                 rgb15to16(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1550         else
1551         {
1552                 int i;
1553                 uint8_t *srcPtr= src[0];
1554                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1555
1556                 for(i=0; i<srcSliceH; i++)
1557                 {
1558                         rgb15to16(srcPtr, dstPtr, c->srcW<<1);
1559                         srcPtr+= srcStride[0];
1560                         dstPtr+= dstStride[0];
1561                 }
1562         }
1563 }
1564
1565 static void bgr15to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1566              int srcSliceH, uint8_t* dst[], int dstStride[]){
1567
1568         if(dstStride[0]*2==srcStride[0]*3)
1569                 rgb15to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1570         else
1571         {
1572                 int i;
1573                 uint8_t *srcPtr= src[0];
1574                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1575
1576                 for(i=0; i<srcSliceH; i++)
1577                 {
1578                         rgb15to24(srcPtr, dstPtr, c->srcW<<1);
1579                         srcPtr+= srcStride[0];
1580                         dstPtr+= dstStride[0];
1581                 }
1582         }
1583 }
1584
1585 static void bgr15to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1586              int srcSliceH, uint8_t* dst[], int dstStride[]){
1587
1588         if(dstStride[0]*2==srcStride[0]*4)
1589                 rgb15to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1590         else
1591         {
1592                 int i;
1593                 uint8_t *srcPtr= src[0];
1594                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1595
1596                 for(i=0; i<srcSliceH; i++)
1597                 {
1598                         rgb15to32(srcPtr, dstPtr, c->srcW<<1);
1599                         srcPtr+= srcStride[0];
1600                         dstPtr+= dstStride[0];
1601                 }
1602         }
1603 }
1604
1605 static void bgr16to24Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1606              int srcSliceH, uint8_t* dst[], int dstStride[]){
1607
1608         if(dstStride[0]*2==srcStride[0]*3)
1609                 rgb16to24(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1610         else
1611         {
1612                 int i;
1613                 uint8_t *srcPtr= src[0];
1614                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1615
1616                 for(i=0; i<srcSliceH; i++)
1617                 {
1618                         rgb16to24(srcPtr, dstPtr, c->srcW<<1);
1619                         srcPtr+= srcStride[0];
1620                         dstPtr+= dstStride[0];
1621                 }
1622         }
1623 }
1624
1625 static void bgr16to32Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1626              int srcSliceH, uint8_t* dst[], int dstStride[]){
1627
1628         if(dstStride[0]*2==srcStride[0]*4)
1629                 rgb16to32(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
1630         else
1631         {
1632                 int i;
1633                 uint8_t *srcPtr= src[0];
1634                 uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1635
1636                 for(i=0; i<srcSliceH; i++)
1637                 {
1638                         rgb16to32(srcPtr, dstPtr, c->srcW<<1);
1639                         srcPtr+= srcStride[0];
1640                         dstPtr+= dstStride[0];
1641                 }
1642         }
1643 }
1644
1645 static void bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1646              int srcSliceH, uint8_t* dst[], int dstStride[]){
1647
1648         rgb24toyv12(
1649                 src[0],
1650                 dst[0]+ srcSliceY    *dstStride[0],
1651                 dst[1]+(srcSliceY>>1)*dstStride[1],
1652                 dst[2]+(srcSliceY>>1)*dstStride[2],
1653                 c->srcW, srcSliceH,
1654                 dstStride[0], dstStride[1], srcStride[0]);
1655 }
1656
1657 /**
1658  * bring pointers in YUV order instead of YVU
1659  */
1660 static inline void orderYUV(int format, uint8_t * sortedP[], int sortedStride[], uint8_t * p[], int stride[]){
1661         if(format == IMGFMT_YV12 || format == IMGFMT_YVU9){
1662                 sortedP[0]= p[0];
1663                 sortedP[1]= p[1];
1664                 sortedP[2]= p[2];
1665                 sortedStride[0]= stride[0];
1666                 sortedStride[1]= stride[1];
1667                 sortedStride[2]= stride[2];
1668         }
1669         else if(isPacked(format) || isGray(format))
1670         {
1671                 sortedP[0]= p[0];
1672                 sortedP[1]=
1673                 sortedP[2]= NULL;
1674                 sortedStride[0]= stride[0];
1675                 sortedStride[1]=
1676                 sortedStride[2]= 0;
1677         }
1678         else /* I420 */
1679         {
1680                 sortedP[0]= p[0];
1681                 sortedP[1]= p[2];
1682                 sortedP[2]= p[1];
1683                 sortedStride[0]= stride[0];
1684                 sortedStride[1]= stride[2];
1685                 sortedStride[2]= stride[1];
1686         }
1687 }
1688
1689 /* unscaled copy like stuff (assumes nearly identical formats) */
1690 static void simpleCopy(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
1691              int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){
1692
1693         int srcStride[3];
1694         int dstStride[3];
1695         uint8_t *src[3];
1696         uint8_t *dst[3];
1697
1698         orderYUV(c->srcFormat, src, srcStride, srcParam, srcStrideParam);
1699         orderYUV(c->dstFormat, dst, dstStride, dstParam, dstStrideParam);
1700
1701         if(isPacked(c->srcFormat))
1702         {
1703                 if(dstStride[0]==srcStride[0])
1704                         memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
1705                 else
1706                 {
1707                         int i;
1708                         uint8_t *srcPtr= src[0];
1709                         uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
1710                         int length=0;
1711
1712                         /* universal length finder */
1713                         while(length+c->srcW <= ABS(dstStride[0])
1714                            && length+c->srcW <= ABS(srcStride[0])) length+= c->srcW;
1715                         ASSERT(length!=0);
1716
1717                         for(i=0; i<srcSliceH; i++)
1718                         {
1719                                 memcpy(dstPtr, srcPtr, length);
1720                                 srcPtr+= srcStride[0];
1721                                 dstPtr+= dstStride[0];
1722                         }
1723                 }
1724         }
1725         else
1726         { /* Planar YUV or gray */
1727                 int plane;
1728                 for(plane=0; plane<3; plane++)
1729                 {
1730                         int length= plane==0 ? c->srcW  : -((-c->srcW  )>>c->chrDstHSubSample);
1731                         int y=      plane==0 ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
1732                         int height= plane==0 ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
1733
1734                         if((isGray(c->srcFormat) || isGray(c->dstFormat)) && plane>0)
1735                         {
1736                                 if(!isGray(c->dstFormat))
1737                                         memset(dst[plane], 128, dstStride[plane]*height);
1738                         }
1739                         else
1740                         {
1741                                 if(dstStride[plane]==srcStride[plane])
1742                                         memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
1743                                 else
1744                                 {
1745                                         int i;
1746                                         uint8_t *srcPtr= src[plane];
1747                                         uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
1748                                         for(i=0; i<height; i++)
1749                                         {
1750                                                 memcpy(dstPtr, srcPtr, length);
1751                                                 srcPtr+= srcStride[plane];
1752                                                 dstPtr+= dstStride[plane];
1753                                         }
1754                                 }
1755                         }
1756                 }
1757         }
1758 }
1759
1760 static int remove_dup_fourcc(int fourcc)
1761 {
1762         switch(fourcc)
1763         {
1764             case IMGFMT_IYUV: return IMGFMT_I420;
1765             case IMGFMT_Y8  : return IMGFMT_Y800;
1766             case IMGFMT_IF09: return IMGFMT_YVU9;
1767             default: return fourcc;
1768         }
1769 }
1770
1771 static void getSubSampleFactors(int *h, int *v, int format){
1772         switch(format){
1773         case IMGFMT_YUY2:
1774                 *h=1;
1775                 *v=0;
1776                 break;
1777         case IMGFMT_YV12:
1778         case IMGFMT_I420:
1779         case IMGFMT_Y800: //FIXME remove after different subsamplings are fully implemented
1780                 *h=1;
1781                 *v=1;
1782                 break;
1783         case IMGFMT_YVU9:
1784                 *h=2;
1785                 *v=2;
1786                 break;
1787         default:
1788                 *h=0;
1789                 *v=0;
1790                 break;
1791         }
1792 }
1793
1794 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1795                          SwsFilter *srcFilter, SwsFilter *dstFilter){
1796
1797         SwsContext *c;
1798         int i;
1799         int usesFilter;
1800         int unscaled;
1801         SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1802 #ifdef ARCH_X86
1803         if(gCpuCaps.hasMMX)
1804                 asm volatile("emms\n\t"::: "memory");
1805 #endif
1806         if(swScale==NULL) globalInit();
1807 //srcFormat= IMGFMT_Y800;
1808 //srcFormat= IMGFMT_YVU9;
1809         /* avoid dupplicate Formats, so we dont need to check to much */
1810         srcFormat = remove_dup_fourcc(srcFormat);
1811         dstFormat = remove_dup_fourcc(dstFormat);
1812
1813         unscaled = (srcW == dstW && srcH == dstH);
1814
1815         if(!isSupportedIn(srcFormat))
1816         {
1817                 MSG_ERR("swScaler: %s is not supported as input format\n", vo_format_name(srcFormat));
1818                 return NULL;
1819         }
1820         if(!isSupportedOut(dstFormat))
1821         {
1822                 MSG_ERR("swScaler: %s is not supported as output format\n", vo_format_name(dstFormat));
1823                 return NULL;
1824         }
1825
1826         /* sanity check */
1827         if(srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1828         {
1829                  MSG_ERR("swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
1830                         srcW, srcH, dstW, dstH);
1831                 return NULL;
1832         }
1833
1834         if(!dstFilter) dstFilter= &dummyFilter;
1835         if(!srcFilter) srcFilter= &dummyFilter;
1836
1837         c= memalign(64, sizeof(SwsContext));
1838         memset(c, 0, sizeof(SwsContext));
1839
1840         c->srcW= srcW;
1841         c->srcH= srcH;
1842         c->dstW= dstW;
1843         c->dstH= dstH;
1844         c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1845         c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1846         c->flags= flags;
1847         c->dstFormat= dstFormat;
1848         c->srcFormat= srcFormat;
1849
1850         usesFilter=0;
1851         if(dstFilter->lumV!=NULL && dstFilter->lumV->length>1) usesFilter=1;
1852         if(dstFilter->lumH!=NULL && dstFilter->lumH->length>1) usesFilter=1;
1853         if(dstFilter->chrV!=NULL && dstFilter->chrV->length>1) usesFilter=1;
1854         if(dstFilter->chrH!=NULL && dstFilter->chrH->length>1) usesFilter=1;
1855         if(srcFilter->lumV!=NULL && srcFilter->lumV->length>1) usesFilter=1;
1856         if(srcFilter->lumH!=NULL && srcFilter->lumH->length>1) usesFilter=1;
1857         if(srcFilter->chrV!=NULL && srcFilter->chrV->length>1) usesFilter=1;
1858         if(srcFilter->chrH!=NULL && srcFilter->chrH->length>1) usesFilter=1;
1859
1860         getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
1861         getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
1862
1863         // reuse chroma for 2 pixles rgb/bgr unless user wants full chroma interpolation
1864         if((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
1865
1866         // drop eery 2. pixel for chroma calculation unless user wants full chroma
1867         if((isBGR(srcFormat) || isRGB(srcFormat) || srcFormat==IMGFMT_YUY2) && !(flags&SWS_FULL_CHR_V))
1868                 c->chrSrcVSubSample=1;
1869
1870         // drop eery 2. pixel for chroma calculation unless user wants full chroma
1871         if((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP))
1872                 c->chrSrcHSubSample=1;
1873
1874         c->chrIntHSubSample= c->chrDstHSubSample;
1875         c->chrIntVSubSample= c->chrSrcVSubSample;
1876
1877         // note the -((-x)>>y) is so that we allways round toward +inf
1878         c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
1879         c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
1880         c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
1881         c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
1882 /*      printf("%d %d %d %d / %d %d %d %d //\n",
1883         c->chrSrcW,
1884 c->chrSrcH,
1885 c->chrDstW,
1886 c->chrDstH,
1887 srcW,
1888 srcH,
1889 dstW,
1890 dstH);*/
1891
1892         /* unscaled special Cases */
1893         if(unscaled && !usesFilter)
1894         {
1895                 /* yv12_to_nv12 */
1896                 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_NV12)
1897                 {
1898                         c->swScale= PlanarToNV12Wrapper;
1899
1900                         if(flags&SWS_PRINT_INFO)
1901                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1902                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1903                         return c;
1904                 }
1905                 /* yv12_to_yuy2 */
1906                 if((srcFormat == IMGFMT_YV12||srcFormat==IMGFMT_I420)&&dstFormat == IMGFMT_YUY2)
1907                 {
1908                         c->swScale= PlanarToYuy2Wrapper;
1909
1910                         if(flags&SWS_PRINT_INFO)
1911                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1912                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1913                         return c;
1914                 }
1915                 /* yuv2bgr */
1916                 if((srcFormat==IMGFMT_YV12 || srcFormat==IMGFMT_I420) && isBGR(dstFormat))
1917                 {
1918                         // FIXME multiple yuv2rgb converters wont work that way cuz that thing is full of globals&statics
1919 #ifdef WORDS_BIGENDIAN
1920                         if(dstFormat==IMGFMT_BGR32)
1921                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_BGR);
1922                         else
1923                                 yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1924 #else
1925                         yuv2rgb_init( dstFormat&0xFF /* =bpp */, MODE_RGB);
1926 #endif
1927                         c->swScale= planarYuvToBgr;
1928
1929                         if(flags&SWS_PRINT_INFO)
1930                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1931                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1932                         return c;
1933                 }
1934 #if 1
1935                 /* simple copy */
1936                 if(   srcFormat == dstFormat
1937                    || (srcFormat==IMGFMT_YV12 && dstFormat==IMGFMT_I420)
1938                    || (srcFormat==IMGFMT_I420 && dstFormat==IMGFMT_YV12)
1939                    || (isPlanarYUV(srcFormat) && isGray(dstFormat))
1940                    || (isPlanarYUV(dstFormat) && isGray(srcFormat))
1941                   )
1942                 {
1943                         c->swScale= simpleCopy;
1944
1945                         if(flags&SWS_PRINT_INFO)
1946                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1947                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1948                         return c;
1949                 }
1950 #endif
1951                 /* bgr32to24 & rgb32to24*/
1952                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR24)
1953                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB24))
1954                 {
1955                         c->swScale= bgr32to24Wrapper;
1956
1957                         if(flags&SWS_PRINT_INFO)
1958                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1959                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1960                         return c;
1961                 }
1962
1963                 /* bgr32to16 & rgb32to16*/
1964                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR16)
1965                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB16))
1966                 {
1967                         c->swScale= bgr32to16Wrapper;
1968
1969                         if(flags&SWS_PRINT_INFO)
1970                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1971                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1972                         return c;
1973                 }
1974
1975                 /* bgr32to15 & rgb32to15*/
1976                 if((srcFormat==IMGFMT_BGR32 && dstFormat==IMGFMT_BGR15)
1977                  ||(srcFormat==IMGFMT_RGB32 && dstFormat==IMGFMT_RGB15))
1978                 {
1979                         c->swScale= bgr32to15Wrapper;
1980
1981                         if(flags&SWS_PRINT_INFO)
1982                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1983                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1984                         return c;
1985                 }
1986
1987                 /* bgr24to32 & rgb24to32*/
1988                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR32)
1989                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB32))
1990                 {
1991                         c->swScale= bgr24to32Wrapper;
1992
1993                         if(flags&SWS_PRINT_INFO)
1994                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
1995                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
1996                         return c;
1997                 }
1998
1999                 /* bgr24to16 & rgb24to16*/
2000                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR16)
2001                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB16))
2002                 {
2003                         c->swScale= bgr24to16Wrapper;
2004
2005                         if(flags&SWS_PRINT_INFO)
2006                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2007                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2008                         return c;
2009                 }
2010
2011                 /* bgr24to15 & rgb24to15*/
2012                 if((srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_BGR15)
2013                  ||(srcFormat==IMGFMT_RGB24 && dstFormat==IMGFMT_RGB15))
2014                 {
2015                         c->swScale= bgr24to15Wrapper;
2016
2017                         if(flags&SWS_PRINT_INFO)
2018                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2019                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2020                         return c;
2021                 }
2022
2023                 /* bgr15to16 */
2024                 if(srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR16)
2025                 {
2026                         c->swScale= bgr15to16Wrapper;
2027
2028                         if(flags&SWS_PRINT_INFO)
2029                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2030                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2031                         return c;
2032                 }
2033
2034                 /* bgr15to24 */
2035                 if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR24)
2036                  ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB24))
2037                 {
2038                         c->swScale= bgr15to24Wrapper;
2039
2040                         if(flags&SWS_PRINT_INFO)
2041                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2042                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2043                         return c;
2044                 }
2045
2046 #if 0 //segfaults
2047                 /* bgr15to32 */
2048                 if((srcFormat==IMGFMT_BGR15 && dstFormat==IMGFMT_BGR32)
2049                  ||(srcFormat==IMGFMT_RGB15 && dstFormat==IMGFMT_RGB32))
2050                 {
2051                         c->swScale= bgr15to32Wrapper;
2052
2053                         if(flags&SWS_PRINT_INFO)
2054                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2055                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2056                         return c;
2057                 }
2058 #endif
2059                 /* bgr16to24 */
2060                 if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR24)
2061                  ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB24))
2062                 {
2063                         c->swScale= bgr16to24Wrapper;
2064
2065                         if(flags&SWS_PRINT_INFO)
2066                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2067                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2068                         return c;
2069                 }
2070
2071 #if 0 //segfaults
2072                 /* bgr16to32 */
2073                 if((srcFormat==IMGFMT_BGR16 && dstFormat==IMGFMT_BGR32)
2074                  ||(srcFormat==IMGFMT_RGB16 && dstFormat==IMGFMT_RGB32))
2075                 {
2076                         c->swScale= bgr16to32Wrapper;
2077
2078                         if(flags&SWS_PRINT_INFO)
2079                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2080                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2081                         return c;
2082                 }
2083 #endif
2084                 /* bgr24toYV12 */
2085                 if(srcFormat==IMGFMT_BGR24 && dstFormat==IMGFMT_YV12)
2086                 {
2087                         c->swScale= bgr24toyv12Wrapper;
2088
2089                         if(flags&SWS_PRINT_INFO)
2090                                 MSG_INFO("SwScaler: using unscaled %s -> %s special converter\n",
2091                                         vo_format_name(srcFormat), vo_format_name(dstFormat));
2092                         return c;
2093                 }
2094         }
2095
2096         if(cpuCaps.hasMMX2)
2097         {
2098                 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2099                 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
2100                 {
2101                         if(flags&SWS_PRINT_INFO)
2102                                 MSG_INFO("SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
2103                 }
2104         }
2105         else
2106                 c->canMMX2BeUsed=0;
2107
2108         c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
2109         c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
2110
2111         // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2112         // but only for the FAST_BILINEAR mode otherwise do correct scaling
2113         // n-2 is the last chrominance sample available
2114         // this is not perfect, but noone shuld notice the difference, the more correct variant
2115         // would be like the vertical one, but that would require some special code for the
2116         // first and last pixel
2117         if(flags&SWS_FAST_BILINEAR)
2118         {
2119                 if(c->canMMX2BeUsed)
2120                 {
2121                         c->lumXInc+= 20;
2122                         c->chrXInc+= 20;
2123                 }
2124                 //we dont use the x86asm scaler if mmx is available
2125                 else if(cpuCaps.hasMMX)
2126                 {
2127                         c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2128                         c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
2129                 }
2130         }
2131
2132         /* precalculate horizontal scaler filter coefficients */
2133         {
2134                 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
2135
2136                 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
2137                                  srcW      ,       dstW, filterAlign, 1<<14, flags,
2138                                  srcFilter->lumH, dstFilter->lumH);
2139                 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
2140                                  c->chrSrcW, c->chrDstW, filterAlign, 1<<14, flags,
2141                                  srcFilter->chrH, dstFilter->chrH);
2142
2143 #ifdef ARCH_X86
2144 // cant downscale !!!
2145                 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
2146                 {
2147                         c->lumMmx2Filter   = (int16_t*)memalign(8, (dstW        /8+8)*sizeof(int16_t));
2148                         c->chrMmx2Filter   = (int16_t*)memalign(8, (c->chrDstW  /4+8)*sizeof(int16_t));
2149                         c->lumMmx2FilterPos= (int32_t*)memalign(8, (dstW      /2/8+8)*sizeof(int32_t));
2150                         c->chrMmx2FilterPos= (int32_t*)memalign(8, (c->chrDstW/2/4+8)*sizeof(int32_t));
2151
2152                         initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode , c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
2153                         initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
2154                 }
2155 #endif
2156         } // Init Horizontal stuff
2157
2158
2159
2160         /* precalculate vertical scaler filter coefficients */
2161         initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
2162                         srcH      ,        dstH, 1, (1<<12)-4, flags,
2163                         srcFilter->lumV, dstFilter->lumV);
2164         initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
2165                         c->chrSrcH, c->chrDstH, 1, (1<<12)-4, flags,
2166                          srcFilter->chrV, dstFilter->chrV);
2167
2168         // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2169         c->vLumBufSize= c->vLumFilterSize;
2170         c->vChrBufSize= c->vChrFilterSize;
2171         for(i=0; i<dstH; i++)
2172         {
2173                 int chrI= i*c->chrDstH / dstH;
2174                 int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
2175                                  ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
2176                 nextSlice&= ~3; // Slices start at boundaries which are divisable through 4
2177                 if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
2178                         c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
2179                 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
2180                         c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI];
2181         }
2182
2183         // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2184         c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
2185         c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
2186         //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
2187         for(i=0; i<c->vLumBufSize; i++)
2188                 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
2189         for(i=0; i<c->vChrBufSize; i++)
2190                 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
2191
2192         //try to avoid drawing green stuff between the right end and the stride end
2193         for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
2194         for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
2195
2196         ASSERT(c->chrDstH <= dstH)
2197
2198         // pack filter data for mmx code
2199         if(cpuCaps.hasMMX)
2200         {
2201                 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
2202                 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
2203                 for(i=0; i<c->vLumFilterSize*dstH; i++)
2204                         c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
2205                                 c->vLumFilter[i];
2206                 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
2207                         c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
2208                                 c->vChrFilter[i];
2209         }
2210
2211         if(flags&SWS_PRINT_INFO)
2212         {
2213 #ifdef DITHER1XBPP
2214                 char *dither= " dithered";
2215 #else
2216                 char *dither= "";
2217 #endif
2218                 if(flags&SWS_FAST_BILINEAR)
2219                         MSG_INFO("\nSwScaler: FAST_BILINEAR scaler, ");
2220                 else if(flags&SWS_BILINEAR)
2221                         MSG_INFO("\nSwScaler: BILINEAR scaler, ");
2222                 else if(flags&SWS_BICUBIC)
2223                         MSG_INFO("\nSwScaler: BICUBIC scaler, ");
2224                 else if(flags&SWS_X)
2225                         MSG_INFO("\nSwScaler: Experimental scaler, ");
2226                 else if(flags&SWS_POINT)
2227                         MSG_INFO("\nSwScaler: Nearest Neighbor / POINT scaler, ");
2228                 else if(flags&SWS_AREA)
2229                         MSG_INFO("\nSwScaler: Area Averageing scaler, ");
2230                 else
2231                         MSG_INFO("\nSwScaler: ehh flags invalid?! ");
2232
2233                 if(dstFormat==IMGFMT_BGR15 || dstFormat==IMGFMT_BGR16)
2234                         MSG_INFO("from %s to%s %s ",
2235                                 vo_format_name(srcFormat), dither, vo_format_name(dstFormat));
2236                 else
2237                         MSG_INFO("from %s to %s ",
2238                                 vo_format_name(srcFormat), vo_format_name(dstFormat));
2239
2240                 if(cpuCaps.hasMMX2)
2241                         MSG_INFO("using MMX2\n");
2242                 else if(cpuCaps.has3DNow)
2243                         MSG_INFO("using 3DNOW\n");
2244                 else if(cpuCaps.hasMMX)
2245                         MSG_INFO("using MMX\n");
2246                 else
2247                         MSG_INFO("using C\n");
2248         }
2249
2250         if((flags & SWS_PRINT_INFO) && verbose)
2251         {
2252                 if(cpuCaps.hasMMX)
2253                 {
2254                         if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
2255                                 MSG_V("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2256                         else
2257                         {
2258                                 if(c->hLumFilterSize==4)
2259                                         MSG_V("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2260                                 else if(c->hLumFilterSize==8)
2261                                         MSG_V("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2262                                 else
2263                                         MSG_V("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2264
2265                                 if(c->hChrFilterSize==4)
2266                                         MSG_V("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2267                                 else if(c->hChrFilterSize==8)
2268                                         MSG_V("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2269                                 else
2270                                         MSG_V("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2271                         }
2272                 }
2273                 else
2274                 {
2275 #ifdef ARCH_X86
2276                         MSG_V("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2277 #else
2278                         if(flags & SWS_FAST_BILINEAR)
2279                                 MSG_V("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2280                         else
2281                                 MSG_V("SwScaler: using C scaler for horizontal scaling\n");
2282 #endif
2283                 }
2284                 if(isPlanarYUV(dstFormat))
2285                 {
2286                         if(c->vLumFilterSize==1)
2287                                 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
2288                         else
2289                                 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
2290                 }
2291                 else
2292                 {
2293                         if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
2294                                 MSG_V("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2295                                        "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
2296                         else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
2297                                 MSG_V("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
2298                         else
2299                                 MSG_V("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
2300                 }
2301
2302                 if(dstFormat==IMGFMT_BGR24)
2303                         MSG_V("SwScaler: using %s YV12->BGR24 Converter\n",
2304                                 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
2305                 else if(dstFormat==IMGFMT_BGR32)
2306                         MSG_V("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2307                 else if(dstFormat==IMGFMT_BGR16)
2308                         MSG_V("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2309                 else if(dstFormat==IMGFMT_BGR15)
2310                         MSG_V("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
2311
2312                 MSG_V("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2313         }
2314         if((flags & SWS_PRINT_INFO) && verbose>1)
2315         {
2316                 MSG_DBG2("SwScaler:Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2317                         c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
2318                 MSG_DBG2("SwScaler:Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2319                         c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
2320         }
2321
2322         c->swScale= swScale;
2323         return c;
2324 }
2325
2326 /**
2327  * returns a normalized gaussian curve used to filter stuff
2328  * quality=3 is high quality, lowwer is lowwer quality
2329  */
2330
2331 SwsVector *getGaussianVec(double variance, double quality){
2332         const int length= (int)(variance*quality + 0.5) | 1;
2333         int i;
2334         double *coeff= memalign(sizeof(double), length*sizeof(double));
2335         double middle= (length-1)*0.5;
2336         SwsVector *vec= malloc(sizeof(SwsVector));
2337
2338         vec->coeff= coeff;
2339         vec->length= length;
2340
2341         for(i=0; i<length; i++)
2342         {
2343                 double dist= i-middle;
2344                 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
2345         }
2346
2347         normalizeVec(vec, 1.0);
2348
2349         return vec;
2350 }
2351
2352 SwsVector *getConstVec(double c, int length){
2353         int i;
2354         double *coeff= memalign(sizeof(double), length*sizeof(double));
2355         SwsVector *vec= malloc(sizeof(SwsVector));
2356
2357         vec->coeff= coeff;
2358         vec->length= length;
2359
2360         for(i=0; i<length; i++)
2361                 coeff[i]= c;
2362
2363         return vec;
2364 }
2365
2366
2367 SwsVector *getIdentityVec(void){
2368         double *coeff= memalign(sizeof(double), sizeof(double));
2369         SwsVector *vec= malloc(sizeof(SwsVector));
2370         coeff[0]= 1.0;
2371
2372         vec->coeff= coeff;
2373         vec->length= 1;
2374
2375         return vec;
2376 }
2377
2378 void normalizeVec(SwsVector *a, double height){
2379         int i;
2380         double sum=0;
2381         double inv;
2382
2383         for(i=0; i<a->length; i++)
2384                 sum+= a->coeff[i];
2385
2386         inv= height/sum;
2387
2388         for(i=0; i<a->length; i++)
2389                 a->coeff[i]*= height;
2390 }
2391
2392 void scaleVec(SwsVector *a, double scalar){
2393         int i;
2394
2395         for(i=0; i<a->length; i++)
2396                 a->coeff[i]*= scalar;
2397 }
2398
2399 static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
2400         int length= a->length + b->length - 1;
2401         double *coeff= memalign(sizeof(double), length*sizeof(double));
2402         int i, j;
2403         SwsVector *vec= malloc(sizeof(SwsVector));
2404
2405         vec->coeff= coeff;
2406         vec->length= length;
2407
2408         for(i=0; i<length; i++) coeff[i]= 0.0;
2409
2410         for(i=0; i<a->length; i++)
2411         {
2412                 for(j=0; j<b->length; j++)
2413                 {
2414                         coeff[i+j]+= a->coeff[i]*b->coeff[j];
2415                 }
2416         }
2417
2418         return vec;
2419 }
2420
2421 static SwsVector *sumVec(SwsVector *a, SwsVector *b){
2422         int length= MAX(a->length, b->length);
2423         double *coeff= memalign(sizeof(double), length*sizeof(double));
2424         int i;
2425         SwsVector *vec= malloc(sizeof(SwsVector));
2426
2427         vec->coeff= coeff;
2428         vec->length= length;
2429
2430         for(i=0; i<length; i++) coeff[i]= 0.0;
2431
2432         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2433         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
2434
2435         return vec;
2436 }
2437
2438 static SwsVector *diffVec(SwsVector *a, SwsVector *b){
2439         int length= MAX(a->length, b->length);
2440         double *coeff= memalign(sizeof(double), length*sizeof(double));
2441         int i;
2442         SwsVector *vec= malloc(sizeof(SwsVector));
2443
2444         vec->coeff= coeff;
2445         vec->length= length;
2446
2447         for(i=0; i<length; i++) coeff[i]= 0.0;
2448
2449         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
2450         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
2451
2452         return vec;
2453 }
2454
2455 /* shift left / or right if "shift" is negative */
2456 static SwsVector *getShiftedVec(SwsVector *a, int shift){
2457         int length= a->length + ABS(shift)*2;
2458         double *coeff= memalign(sizeof(double), length*sizeof(double));
2459         int i;
2460         SwsVector *vec= malloc(sizeof(SwsVector));
2461
2462         vec->coeff= coeff;
2463         vec->length= length;
2464
2465         for(i=0; i<length; i++) coeff[i]= 0.0;
2466
2467         for(i=0; i<a->length; i++)
2468         {
2469                 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
2470         }
2471
2472         return vec;
2473 }
2474
2475 void shiftVec(SwsVector *a, int shift){
2476         SwsVector *shifted= getShiftedVec(a, shift);
2477         free(a->coeff);
2478         a->coeff= shifted->coeff;
2479         a->length= shifted->length;
2480         free(shifted);
2481 }
2482
2483 void addVec(SwsVector *a, SwsVector *b){
2484         SwsVector *sum= sumVec(a, b);
2485         free(a->coeff);
2486         a->coeff= sum->coeff;
2487         a->length= sum->length;
2488         free(sum);
2489 }
2490
2491 void subVec(SwsVector *a, SwsVector *b){
2492         SwsVector *diff= diffVec(a, b);
2493         free(a->coeff);
2494         a->coeff= diff->coeff;
2495         a->length= diff->length;
2496         free(diff);
2497 }
2498
2499 void convVec(SwsVector *a, SwsVector *b){
2500         SwsVector *conv= getConvVec(a, b);
2501         free(a->coeff);
2502         a->coeff= conv->coeff;
2503         a->length= conv->length;
2504         free(conv);
2505 }
2506
2507 SwsVector *cloneVec(SwsVector *a){
2508         double *coeff= memalign(sizeof(double), a->length*sizeof(double));
2509         int i;
2510         SwsVector *vec= malloc(sizeof(SwsVector));
2511
2512         vec->coeff= coeff;
2513         vec->length= a->length;
2514
2515         for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
2516
2517         return vec;
2518 }
2519
2520 void printVec(SwsVector *a){
2521         int i;
2522         double max=0;
2523         double min=0;
2524         double range;
2525
2526         for(i=0; i<a->length; i++)
2527                 if(a->coeff[i]>max) max= a->coeff[i];
2528
2529         for(i=0; i<a->length; i++)
2530                 if(a->coeff[i]<min) min= a->coeff[i];
2531
2532         range= max - min;
2533
2534         for(i=0; i<a->length; i++)
2535         {
2536                 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
2537                 MSG_DBG2("%1.3f ", a->coeff[i]);
2538                 for(;x>0; x--) MSG_DBG2(" ");
2539                 MSG_DBG2("|\n");
2540         }
2541 }
2542
2543 void freeVec(SwsVector *a){
2544         if(!a) return;
2545         if(a->coeff) free(a->coeff);
2546         a->coeff=NULL;
2547         a->length=0;
2548         free(a);
2549 }
2550
2551 void freeSwsContext(SwsContext *c){
2552         int i;
2553         if(!c) return;
2554
2555         if(c->lumPixBuf)
2556         {
2557                 for(i=0; i<c->vLumBufSize; i++)
2558                 {
2559                         if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
2560                         c->lumPixBuf[i]=NULL;
2561                 }
2562                 free(c->lumPixBuf);
2563                 c->lumPixBuf=NULL;
2564         }
2565
2566         if(c->chrPixBuf)
2567         {
2568                 for(i=0; i<c->vChrBufSize; i++)
2569                 {
2570                         if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
2571                         c->chrPixBuf[i]=NULL;
2572                 }
2573                 free(c->chrPixBuf);
2574                 c->chrPixBuf=NULL;
2575         }
2576
2577         if(c->vLumFilter) free(c->vLumFilter);
2578         c->vLumFilter = NULL;
2579         if(c->vChrFilter) free(c->vChrFilter);
2580         c->vChrFilter = NULL;
2581         if(c->hLumFilter) free(c->hLumFilter);
2582         c->hLumFilter = NULL;
2583         if(c->hChrFilter) free(c->hChrFilter);
2584         c->hChrFilter = NULL;
2585
2586         if(c->vLumFilterPos) free(c->vLumFilterPos);
2587         c->vLumFilterPos = NULL;
2588         if(c->vChrFilterPos) free(c->vChrFilterPos);
2589         c->vChrFilterPos = NULL;
2590         if(c->hLumFilterPos) free(c->hLumFilterPos);
2591         c->hLumFilterPos = NULL;
2592         if(c->hChrFilterPos) free(c->hChrFilterPos);
2593         c->hChrFilterPos = NULL;
2594
2595         if(c->lumMmxFilter) free(c->lumMmxFilter);
2596         c->lumMmxFilter = NULL;
2597         if(c->chrMmxFilter) free(c->chrMmxFilter);
2598         c->chrMmxFilter = NULL;
2599
2600         if(c->lumMmx2Filter) free(c->lumMmx2Filter);
2601         c->lumMmx2Filter=NULL;
2602         if(c->chrMmx2Filter) free(c->chrMmx2Filter);
2603         c->chrMmx2Filter=NULL;
2604         if(c->lumMmx2FilterPos) free(c->lumMmx2FilterPos);
2605         c->lumMmx2FilterPos=NULL;
2606         if(c->chrMmx2FilterPos) free(c->chrMmx2FilterPos);
2607         c->chrMmx2FilterPos=NULL;
2608
2609         free(c);
2610 }
2611
2612