git.sesse.net Git - ffmpeg/blob - postproc/swscale.c

   1 /*
   2     Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
   3
   4     This program is free software; you can redistribute it and/or modify
   5     it under the terms of the GNU General Public License as published by
   6     the Free Software Foundation; either version 2 of the License, or
   7     (at your option) any later version.
   8
   9     This program is distributed in the hope that it will be useful,
  10     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12     GNU General Public License for more details.
  13
  14     You should have received a copy of the GNU General Public License
  15     along with this program; if not, write to the Free Software
  16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17 */
  18
  19 /*
  20   supported Input formats: YV12, I420, IYUV (grayscale soon too)
  21   supported output formats: YV12, I420, IYUV, BGR15, BGR16, BGR24, BGR32 (grayscale soon too)
  22   BGR15/16 support dithering
  23 */
  24
  25 #include <inttypes.h>
  26 #include <string.h>
  27 #include <math.h>
  28 #include <stdio.h>
  29 #include "../config.h"
  30 #include "../mangle.h"
  31 #ifdef HAVE_MALLOC_H
  32 #include <malloc.h>
  33 #endif
  34 #include "swscale.h"
  35 #include "../cpudetect.h"
  36 #include "../libvo/img_format.h"
  37 #undef MOVNTQ
  38 #undef PAVGB
  39
  40 //#undef HAVE_MMX2
  41 //#define HAVE_3DNOW
  42 //#undef HAVE_MMX
  43 //#undef ARCH_X86
  44 #define DITHER1XBPP // when defined, the BGR15/BGR16 paths of yuv2rgbXinC() add alternating dither offsets; otherwise the offsets are constant 0
  45
  46 #define RET 0xC3 //near return opcode
  47
     // ASSERT(x): with MP_DEBUG it prints the failed expression and then deliberately
     // crashes by writing through a null pointer; without MP_DEBUG it expands to an
     // empty statement, so x is not evaluated at all.
  48 #ifdef MP_DEBUG
  49 #define ASSERT(x) if(!(x)) { printf("ASSERT " #x " failed\n"); *((int*)0)=0; }
  50 #else
  51 #define ASSERT(x) ;
  52 #endif
  53
     // PI: use the M_PI provided by <math.h> when available, otherwise a literal fallback
  54 #ifdef M_PI
  55 #define PI M_PI
  56 #else
  57 #define PI 3.14159265358979323846
  58 #endif
  59
  60 //FIXME replace this with something faster
     // Format predicates. All four currently expand to the same test (YV12, I420 or IYUV).
     // The argument is evaluated up to three times, so do not pass expressions with side effects.
  61 #define isYUV(x)       ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
  62 #define isPlanarYUV(x) ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
  63 #define isHalfChrV(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
  64 #define isHalfChrH(x)  ((x)==IMGFMT_YV12 || (x)==IMGFMT_I420 || (x)==IMGFMT_IYUV)
  65
  66 extern int verbose; // defined in mplayer.c
  67 /*
  68 NOTES
  69
  70 known BUGS with known cause (no bug reports please, but patches are welcome :) )
  71 horizontal fast_bilinear MMX2 scaler reads 1-7 samples past the end of the input (might cause a sig11)
  72
  73 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  74
  75 TODO
  76 more intelligent misalignment avoidance for the horizontal scaler
  77 change the distance of the u & v buffer
  78 write special vertical cubic upscale version
  79 Optimize C code (yv12 / minmax)
  80 add support for packed pixel yuv input & output
  81 add support for Y8 input & output
  82 add BGR4 output support
  83 add BGR32 / BGR24 input support
  84 */
  85
  86 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
  87 #define MIN(a,b) ((a) > (b) ? (b) : (a))
  88 #define MAX(a,b) ((a) < (b) ? (b) : (a))
     // NOTE: the three macros above evaluate their arguments more than once;
     // never pass expressions with side effects.
  89
     // x86 inline assembly is only considered when the build targets x86
  90 #ifdef ARCH_X86
  91 #define CAN_COMPILE_X86_ASM
  92 #endif
  93
  94 #ifdef CAN_COMPILE_X86_ASM
     // 8-byte aligned 64-bit constants. Apart from in_asm_used_var_warning_killer() nothing
     // in the visible C code reads them; presumably they are referenced by name from the
     // inline asm in swscale_template.c -- confirm there before renaming or removing any.
     // Each *Coeff / w* value is one 16-bit word repeated four times, bF8 / bFC are one
     // byte repeated eight times.
  95 static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
  96 static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
  97 static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
  98 static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
  99 static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
 100 static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
 101 static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
 102 static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
 103 static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
 104 static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
 105 static uint64_t __attribute__((aligned(8))) w02=       0x0002000200020002LL;
     // byte-granular masks: low 4 bytes, low 3 bytes, high 5 bytes of a quadword
 106 static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
 107 static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
 108 static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
 109
     // dither state; zero-initialised here and not written by any C code visible in this part of the file
 110 static volatile uint64_t __attribute__((aligned(8))) b5Dither;
 111 static volatile uint64_t __attribute__((aligned(8))) g5Dither;
 112 static volatile uint64_t __attribute__((aligned(8))) g6Dither;
 113 static volatile uint64_t __attribute__((aligned(8))) r5Dither;
 114
     // two alternative dither patterns each, four 16-bit words per entry
 115 static uint64_t __attribute__((aligned(8))) dither4[2]={
 116         0x0103010301030103LL,
 117         0x0200020002000200LL,};
 118
 119 static uint64_t __attribute__((aligned(8))) dither8[2]={
 120         0x0602060206020602LL,
 121         0x0004000400040004LL,};
 122
     // per-word field masks: 5/6/5 bit split for the *16Mask set, 5/5/5 for the *15Mask set
 123 static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
 124 static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
 125 static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
 126 static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
 127 static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
 128 static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
 129
     // masks selecting every third byte of a quadword, at byte offsets 0, 1 and 2 respectively
 130 static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
 131 static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
 132 static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;
 133
 134 // FIXME remove
 135 static uint64_t __attribute__((aligned(8))) asm_yalpha1;
 136 static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
 137 #endif
 138
 139 // clipping helper table for the C implementations (768 entries; filled in elsewhere):
 140 static unsigned char clip_table[768];
 141
     // same idea for the packed 16-bit outputs: one table per colour channel whose entries
     // are OR-ed together into a pixel by yuv2rgbXinC()
 142 static unsigned short clip_table16b[768];
 143 static unsigned short clip_table16g[768];
 144 static unsigned short clip_table16r[768];
 145 static unsigned short clip_table15b[768];
 146 static unsigned short clip_table15g[768];
 147 static unsigned short clip_table15r[768];
 148
 149 // yuv->rgb conversion tables (256 entries each, so only indices 0..255 are valid):
 150 static    int yuvtab_2568[256];
 151 static    int yuvtab_3343[256];
 152 static    int yuvtab_0c92[256];
 153 static    int yuvtab_1a1e[256];
 154 static    int yuvtab_40cf[256];
 155 // Needed for the cubic scaler to catch overflows: 768-entry variants which
     // yuv2rgbXinC() indexes as [value + 256]
 156 static    int clip_yuvtab_2568[768];
 157 static    int clip_yuvtab_3343[768];
 158 static    int clip_yuvtab_0c92[768];
 159 static    int clip_yuvtab_1a1e[768];
 160 static    int clip_yuvtab_40cf[768];
 161
 162 //global sws_flags from the command line; getSwsContextFromCmdLine() maps
     //0..5 to a scaler (the default 2 selects SWS_BICUBIC)
 163 int sws_flags=2;
 164
 165 //global srcFilter; its four vectors are freed and rebuilt on every call of getSwsContextFromCmdLine()
 166 SwsFilter src_filter= {NULL, NULL, NULL, NULL};
 167
     // tuning knobs read by getSwsContextFromCmdLine(); 0 disables the respective step
 168 float sws_lum_gblur= 0.0;
 169 float sws_chr_gblur= 0.0;
 170 int sws_chr_vshift= 0;
 171 int sws_chr_hshift= 0;
 172 float sws_chr_sharpen= 0.0;
 173 float sws_lum_sharpen= 0.0;
 174
 175 /* cpuCaps combined from cpudetect and what is actually compiled in
 176    (if there is no support for something compiled in it won't appear here) */
 177 static CpuCaps cpuCaps;
 178
     // Scaler entry point. It is NULL until something assigns it (the assignment is not
     // in this part of the file), so callers must not call through it before then.
 179 void (*swScale)(SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
 180              int srcSliceH, uint8_t* dst[], int dstStride[])=NULL;
 181
     // forward declaration; the definition is further down in the file
 182 static SwsVector *getConvVec(SwsVector *a, SwsVector *b);
 183
 184 #ifdef CAN_COMPILE_X86_ASM
 185 void in_asm_used_var_warning_killer()
 186 {
 187  volatile int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
 188  bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+asm_yalpha1+ asm_uvalpha1+
 189  M24A+M24B+M24C+w02 + b5Dither+g5Dither+r5Dither+g6Dither+dither4[0]+dither8[0];
 190  if(i) i=0;
 191 }
 192 #endif
 193
 194 static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 195                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 196                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
 197 {
 198         //FIXME Optimize (just quickly writen not opti..)
 199         int i;
 200         for(i=0; i<dstW; i++)
 201         {
 202                 int val=0;
 203                 int j;
 204                 for(j=0; j<lumFilterSize; j++)
 205                         val += lumSrc[j][i] * lumFilter[j];
 206
 207                 dest[i]= MIN(MAX(val>>19, 0), 255);
 208         }
 209
 210         if(uDest != NULL)
 211                 for(i=0; i<(dstW>>1); i++)
 212                 {
 213                         int u=0;
 214                         int v=0;
 215                         int j;
 216                         for(j=0; j<chrFilterSize; j++)
 217                         {
 218                                 u += chrSrc[j][i] * chrFilter[j];
 219                                 v += chrSrc[j][i + 2048] * chrFilter[j];
 220                         }
 221
 222                         uDest[i]= MIN(MAX(u>>19, 0), 255);
 223                         vDest[i]= MIN(MAX(v>>19, 0), 255);
 224                 }
 225 }
 226
 227 static inline void yuv2rgbXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 228                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 229                                     uint8_t *dest, int dstW, int dstFormat)
 230 {
 231         if(dstFormat==IMGFMT_BGR32)
 232         {
 233                 int i;
 234                 for(i=0; i<(dstW>>1); i++){
 235                         int j;
 236                         int Y1=0;
 237                         int Y2=0;
 238                         int U=0;
 239                         int V=0;
 240                         int Cb, Cr, Cg;
 241                         for(j=0; j<lumFilterSize; j++)
 242                         {
 243                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 244                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 245                         }
 246                         for(j=0; j<chrFilterSize; j++)
 247                         {
 248                                 U += chrSrc[j][i] * chrFilter[j];
 249                                 V += chrSrc[j][i+2048] * chrFilter[j];
 250                         }
 251                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 252                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 253                         U >>= 19;
 254                         V >>= 19;
 255
 256                         Cb= clip_yuvtab_40cf[U+ 256];
 257                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 258                         Cr= clip_yuvtab_3343[V+ 256];
 259
 260                         dest[8*i+0]=clip_table[((Y1 + Cb) >>13)];
 261                         dest[8*i+1]=clip_table[((Y1 + Cg) >>13)];
 262                         dest[8*i+2]=clip_table[((Y1 + Cr) >>13)];
 263
 264                         dest[8*i+4]=clip_table[((Y2 + Cb) >>13)];
 265                         dest[8*i+5]=clip_table[((Y2 + Cg) >>13)];
 266                         dest[8*i+6]=clip_table[((Y2 + Cr) >>13)];
 267                 }
 268         }
 269         else if(dstFormat==IMGFMT_BGR24)
 270         {
 271                 int i;
 272                 for(i=0; i<(dstW>>1); i++){
 273                         int j;
 274                         int Y1=0;
 275                         int Y2=0;
 276                         int U=0;
 277                         int V=0;
 278                         int Cb, Cr, Cg;
 279                         for(j=0; j<lumFilterSize; j++)
 280                         {
 281                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 282                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 283                         }
 284                         for(j=0; j<chrFilterSize; j++)
 285                         {
 286                                 U += chrSrc[j][i] * chrFilter[j];
 287                                 V += chrSrc[j][i+2048] * chrFilter[j];
 288                         }
 289                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 290                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 291                         U >>= 19;
 292                         V >>= 19;
 293
 294                         Cb= clip_yuvtab_40cf[U+ 256];
 295                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 296                         Cr= clip_yuvtab_3343[V+ 256];
 297
 298                         dest[0]=clip_table[((Y1 + Cb) >>13)];
 299                         dest[1]=clip_table[((Y1 + Cg) >>13)];
 300                         dest[2]=clip_table[((Y1 + Cr) >>13)];
 301
 302                         dest[3]=clip_table[((Y2 + Cb) >>13)];
 303                         dest[4]=clip_table[((Y2 + Cg) >>13)];
 304                         dest[5]=clip_table[((Y2 + Cr) >>13)];
 305                         dest+=6;
 306                 }
 307         }
 308         else if(dstFormat==IMGFMT_BGR16)
 309         {
 310                 int i;
 311 #ifdef DITHER1XBPP
 312                 static int ditherb1=1<<14;
 313                 static int ditherg1=1<<13;
 314                 static int ditherr1=2<<14;
 315                 static int ditherb2=3<<14;
 316                 static int ditherg2=3<<13;
 317                 static int ditherr2=0<<14;
 318
 319                 ditherb1 ^= (1^2)<<14;
 320                 ditherg1 ^= (1^2)<<13;
 321                 ditherr1 ^= (1^2)<<14;
 322                 ditherb2 ^= (3^0)<<14;
 323                 ditherg2 ^= (3^0)<<13;
 324                 ditherr2 ^= (3^0)<<14;
 325 #else
 326                 const int ditherb1=0;
 327                 const int ditherg1=0;
 328                 const int ditherr1=0;
 329                 const int ditherb2=0;
 330                 const int ditherg2=0;
 331                 const int ditherr2=0;
 332 #endif
 333                 for(i=0; i<(dstW>>1); i++){
 334                         int j;
 335                         int Y1=0;
 336                         int Y2=0;
 337                         int U=0;
 338                         int V=0;
 339                         int Cb, Cr, Cg;
 340                         for(j=0; j<lumFilterSize; j++)
 341                         {
 342                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 343                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 344                         }
 345                         for(j=0; j<chrFilterSize; j++)
 346                         {
 347                                 U += chrSrc[j][i] * chrFilter[j];
 348                                 V += chrSrc[j][i+2048] * chrFilter[j];
 349                         }
 350                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 351                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 352                         U >>= 19;
 353                         V >>= 19;
 354
 355                         Cb= clip_yuvtab_40cf[U+ 256];
 356                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 357                         Cr= clip_yuvtab_3343[V+ 256];
 358
 359                         ((uint16_t*)dest)[2*i] =
 360                                 clip_table16b[(Y1 + Cb + ditherb1) >>13] |
 361                                 clip_table16g[(Y1 + Cg + ditherg1) >>13] |
 362                                 clip_table16r[(Y1 + Cr + ditherr1) >>13];
 363
 364                         ((uint16_t*)dest)[2*i+1] =
 365                                 clip_table16b[(Y2 + Cb + ditherb2) >>13] |
 366                                 clip_table16g[(Y2 + Cg + ditherg2) >>13] |
 367                                 clip_table16r[(Y2 + Cr + ditherr2) >>13];
 368                 }
 369         }
 370         else if(dstFormat==IMGFMT_BGR15)
 371         {
 372                 int i;
 373 #ifdef DITHER1XBPP
 374                 static int ditherb1=1<<14;
 375                 static int ditherg1=1<<14;
 376                 static int ditherr1=2<<14;
 377                 static int ditherb2=3<<14;
 378                 static int ditherg2=3<<14;
 379                 static int ditherr2=0<<14;
 380
 381                 ditherb1 ^= (1^2)<<14;
 382                 ditherg1 ^= (1^2)<<14;
 383                 ditherr1 ^= (1^2)<<14;
 384                 ditherb2 ^= (3^0)<<14;
 385                 ditherg2 ^= (3^0)<<14;
 386                 ditherr2 ^= (3^0)<<14;
 387 #else
 388                 const int ditherb1=0;
 389                 const int ditherg1=0;
 390                 const int ditherr1=0;
 391                 const int ditherb2=0;
 392                 const int ditherg2=0;
 393                 const int ditherr2=0;
 394 #endif
 395                 for(i=0; i<(dstW>>1); i++){
 396                         int j;
 397                         int Y1=0;
 398                         int Y2=0;
 399                         int U=0;
 400                         int V=0;
 401                         int Cb, Cr, Cg;
 402                         for(j=0; j<lumFilterSize; j++)
 403                         {
 404                                 Y1 += lumSrc[j][2*i] * lumFilter[j];
 405                                 Y2 += lumSrc[j][2*i+1] * lumFilter[j];
 406                         }
 407                         for(j=0; j<chrFilterSize; j++)
 408                         {
 409                                 U += chrSrc[j][i] * chrFilter[j];
 410                                 V += chrSrc[j][i+2048] * chrFilter[j];
 411                         }
 412                         Y1= clip_yuvtab_2568[ (Y1>>19) + 256 ];
 413                         Y2= clip_yuvtab_2568[ (Y2>>19) + 256 ];
 414                         U >>= 19;
 415                         V >>= 19;
 416
 417                         Cb= clip_yuvtab_40cf[U+ 256];
 418                         Cg= clip_yuvtab_1a1e[V+ 256] + yuvtab_0c92[U+ 256];
 419                         Cr= clip_yuvtab_3343[V+ 256];
 420
 421                         ((uint16_t*)dest)[2*i] =
 422                                 clip_table15b[(Y1 + Cb + ditherb1) >>13] |
 423                                 clip_table15g[(Y1 + Cg + ditherg1) >>13] |
 424                                 clip_table15r[(Y1 + Cr + ditherr1) >>13];
 425
 426                         ((uint16_t*)dest)[2*i+1] =
 427                                 clip_table15b[(Y2 + Cb + ditherb2) >>13] |
 428                                 clip_table15g[(Y2 + Cg + ditherg2) >>13] |
 429                                 clip_table15r[(Y2 + Cr + ditherr2) >>13];
 430                 }
 431         }
 432 }
 433
 434
 435 //Note: we have C, X86, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
 436 //Plain C versions
     // The C variant is needed when MMX is not compiled in, or when the CPU is
     // detected at run time (then every variant is built).
 437 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
 438 #define COMPILE_C
 439 #endif
 440
 441 #ifdef CAN_COMPILE_X86_ASM
 442
     // plain MMX only when neither 3DNOW nor MMX2 is available at compile time
 443 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 444 #define COMPILE_MMX
 445 #endif
 446
 447 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
 448 #define COMPILE_MMX2
 449 #endif
 450
     // MMX2 takes precedence over 3DNOW when both are compiled in
 451 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 452 #define COMPILE_3DNOW
 453 #endif
 454 #endif //CAN_COMPILE_X86_ASM
 455
     // From here on HAVE_MMX / HAVE_MMX2 / HAVE_3DNOW no longer describe the build
     // configuration: they are redefined before each inclusion of the template below
     // to select which variant that inclusion generates.
 456 #undef HAVE_MMX
 457 #undef HAVE_MMX2
 458 #undef HAVE_3DNOW
 459
     // swscale_template.c is included once per enabled variant; RENAME() appends a
     // variant-specific suffix (_C, _MMX, _MMX2, _3DNow) to the names it wraps.
 460 #ifdef COMPILE_C
 461 #undef HAVE_MMX
 462 #undef HAVE_MMX2
 463 #undef HAVE_3DNOW
 464 #define RENAME(a) a ## _C
 465 #include "swscale_template.c"
 466 #endif
 467
 468 #ifdef CAN_COMPILE_X86_ASM
 469
 470 //X86 versions
 471 /*
 472 #undef RENAME
 473 #undef HAVE_MMX
 474 #undef HAVE_MMX2
 475 #undef HAVE_3DNOW
 476 #define ARCH_X86
 477 #define RENAME(a) a ## _X86
 478 #include "swscale_template.c"
 479 */
 480 //MMX versions
 481 #ifdef COMPILE_MMX
 482 #undef RENAME
 483 #define HAVE_MMX
 484 #undef HAVE_MMX2
 485 #undef HAVE_3DNOW
 486 #define RENAME(a) a ## _MMX
 487 #include "swscale_template.c"
 488 #endif
 489
 490 //MMX2 versions
 491 #ifdef COMPILE_MMX2
 492 #undef RENAME
 493 #define HAVE_MMX
 494 #define HAVE_MMX2
 495 #undef HAVE_3DNOW
 496 #define RENAME(a) a ## _MMX2
 497 #include "swscale_template.c"
 498 #endif
 499
 500 //3DNOW versions
 501 #ifdef COMPILE_3DNOW
 502 #undef RENAME
 503 #define HAVE_MMX
 504 #undef HAVE_MMX2
 505 #define HAVE_3DNOW
 506 #define RENAME(a) a ## _3DNow
 507 #include "swscale_template.c"
 508 #endif
 509
 510 #endif //CAN_COMPILE_X86_ASM
 511
 512 // minor note: the HAVE_xyz macros are in an arbitrary state after this point, so don't rely on them
 513
 514
 515 // old global scaler, dont use for new code
 516 // will use sws_flags from the command line
 517 void SwScale_YV12slice(unsigned char* src[], int srcStride[], int srcSliceY ,
 518                              int srcSliceH, uint8_t* dst[], int dstStride, int dstbpp,
 519                              int srcW, int srcH, int dstW, int dstH){
 520
 521         static SwsContext *context=NULL;
 522         int dstFormat;
 523         int dstStride3[3]= {dstStride, dstStride>>1, dstStride>>1};
 524
 525         switch(dstbpp)
 526         {
 527                 case 8 : dstFormat= IMGFMT_Y8;          break;
 528                 case 12: dstFormat= IMGFMT_YV12;        break;
 529                 case 15: dstFormat= IMGFMT_BGR15;       break;
 530                 case 16: dstFormat= IMGFMT_BGR16;       break;
 531                 case 24: dstFormat= IMGFMT_BGR24;       break;
 532                 case 32: dstFormat= IMGFMT_BGR32;       break;
 533                 default: return;
 534         }
 535
 536         if(!context) context=getSwsContextFromCmdLine(srcW, srcH, IMGFMT_YV12, dstW, dstH, dstFormat);
 537
 538         swScale(context, src, srcStride, srcSliceY, srcSliceH, dst, dstStride3);
 539 }
 540
 541 // will use sws_flags & src_filter (from cmd line)
 542 SwsContext *getSwsContextFromCmdLine(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat)
 543 {
 544         int flags=0;
 545         static int firstTime=1;
 546
 547 #ifdef ARCH_X86
 548         if(gCpuCaps.hasMMX)
 549                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 550 #endif
 551         if(firstTime)
 552         {
 553                 firstTime=0;
 554                 flags= SWS_PRINT_INFO;
 555         }
 556         else if(verbose>1) flags= SWS_PRINT_INFO;
 557
 558         if(src_filter.lumH) freeVec(src_filter.lumH);
 559         if(src_filter.lumV) freeVec(src_filter.lumV);
 560         if(src_filter.chrH) freeVec(src_filter.chrH);
 561         if(src_filter.chrV) freeVec(src_filter.chrV);
 562
 563         if(sws_lum_gblur!=0.0){
 564                 src_filter.lumH= getGaussianVec(sws_lum_gblur, 3.0);
 565                 src_filter.lumV= getGaussianVec(sws_lum_gblur, 3.0);
 566         }else{
 567                 src_filter.lumH= getIdentityVec();
 568                 src_filter.lumV= getIdentityVec();
 569         }
 570
 571         if(sws_chr_gblur!=0.0){
 572                 src_filter.chrH= getGaussianVec(sws_chr_gblur, 3.0);
 573                 src_filter.chrV= getGaussianVec(sws_chr_gblur, 3.0);
 574         }else{
 575                 src_filter.chrH= getIdentityVec();
 576                 src_filter.chrV= getIdentityVec();
 577         }
 578
 579         if(sws_chr_sharpen!=0.0){
 580                 SwsVector *g= getConstVec(-1.0, 3);
 581                 SwsVector *id= getConstVec(10.0/sws_chr_sharpen, 1);
 582                 g->coeff[1]=2.0;
 583                 addVec(id, g);
 584                 convVec(src_filter.chrH, id);
 585                 convVec(src_filter.chrV, id);
 586                 freeVec(g);
 587                 freeVec(id);
 588         }
 589
 590         if(sws_lum_sharpen!=0.0){
 591                 SwsVector *g= getConstVec(-1.0, 3);
 592                 SwsVector *id= getConstVec(10.0/sws_lum_sharpen, 1);
 593                 g->coeff[1]=2.0;
 594                 addVec(id, g);
 595                 convVec(src_filter.lumH, id);
 596                 convVec(src_filter.lumV, id);
 597                 freeVec(g);
 598                 freeVec(id);
 599         }
 600
 601         if(sws_chr_hshift)
 602                 shiftVec(src_filter.chrH, sws_chr_hshift);
 603
 604         if(sws_chr_vshift)
 605                 shiftVec(src_filter.chrV, sws_chr_vshift);
 606
 607         normalizeVec(src_filter.chrH, 1.0);
 608         normalizeVec(src_filter.chrV, 1.0);
 609         normalizeVec(src_filter.lumH, 1.0);
 610         normalizeVec(src_filter.lumV, 1.0);
 611
 612         if(verbose > 1) printVec(src_filter.chrH);
 613         if(verbose > 1) printVec(src_filter.lumH);
 614
 615         switch(sws_flags)
 616         {
 617                 case 0: flags|= SWS_FAST_BILINEAR; break;
 618                 case 1: flags|= SWS_BILINEAR; break;
 619                 case 2: flags|= SWS_BICUBIC; break;
 620                 case 3: flags|= SWS_X; break;
 621                 case 4: flags|= SWS_POINT; break;
 622                 case 5: flags|= SWS_AREA; break;
 623                 default:flags|= SWS_BILINEAR; break;
 624         }
 625
 626         return getSwsContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, &src_filter, NULL);
 627 }
 628
 629
 630 static inline void initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
 631                               int srcW, int dstW, int filterAlign, int one, int flags,
 632                               SwsVector *srcFilter, SwsVector *dstFilter)
 633 {
 634         int i;
 635         int filterSize;
 636         int filter2Size;
 637         int minFilterSize;
 638         double *filter=NULL;
 639         double *filter2=NULL;
 640 #ifdef ARCH_X86
 641         if(gCpuCaps.hasMMX)
 642                 asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
 643 #endif
 644
 645         *filterPos = (int16_t*)memalign(8, (dstW+1)*sizeof(int16_t));
 646         (*filterPos)[dstW]=0; // the MMX scaler will read over the end
 647
 648         if(ABS(xInc - 0x10000) <10) // unscaled
 649         {
 650                 int i;
 651                 filterSize= 1;
 652                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 653                 for(i=0; i<dstW*filterSize; i++) filter[i]=0;
 654
 655                 for(i=0; i<dstW; i++)
 656                 {
 657                         filter[i*filterSize]=1;
 658                         (*filterPos)[i]=i;
 659                 }
 660
 661         }
 662         else if(flags&SWS_POINT) // lame looking point sampling mode
 663         {
 664                 int i;
 665                 int xDstInSrc;
 666                 filterSize= 1;
 667                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 668
 669                 xDstInSrc= xInc/2 - 0x8000;
 670                 for(i=0; i<dstW; i++)
 671                 {
 672                         int xx= (xDstInSrc>>16) - (filterSize>>1) + 1;
 673
 674                         (*filterPos)[i]= xx;
 675                         filter[i]= 1.0;
 676                         xDstInSrc+= xInc;
 677                 }
 678         }
 679         else if(xInc <= (1<<16) || (flags&SWS_FAST_BILINEAR)) // upscale
 680         {
 681                 int i;
 682                 int xDstInSrc;
 683                 if     (flags&SWS_BICUBIC) filterSize= 4;
 684                 else if(flags&SWS_X      ) filterSize= 4;
 685                 else                       filterSize= 2; // SWS_BILINEAR / SWS_AREA
 686 //              printf("%d %d %d\n", filterSize, srcW, dstW);
 687                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 688
 689                 xDstInSrc= xInc/2 - 0x8000;
 690                 for(i=0; i<dstW; i++)
 691                 {
 692                         int xx= (xDstInSrc>>16) - (filterSize>>1) + 1;
 693                         int j;
 694
 695                         (*filterPos)[i]= xx;
 696                         if((flags & SWS_BICUBIC) || (flags & SWS_X))
 697                         {
 698                                 double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
 699                                 double y1,y2,y3,y4;
 700                                 double A= -0.6;
 701                                 if(flags & SWS_BICUBIC){
 702                                                 // Equation is from VirtualDub
 703                                         y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
 704                                         y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
 705                                         y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
 706                                         y4 = (                  +           A*d*d -       A*d*d*d);
 707                                 }else{
 708                                                 // cubic interpolation (derived it myself)
 709                                         y1 = (    -2.0*d + 3.0*d*d - 1.0*d*d*d)/6.0;
 710                                         y2 = (6.0 -3.0*d - 6.0*d*d + 3.0*d*d*d)/6.0;
 711                                         y3 = (    +6.0*d + 3.0*d*d - 3.0*d*d*d)/6.0;
 712                                         y4 = (    -1.0*d           + 1.0*d*d*d)/6.0;
 713                                 }
 714
 715 //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
 716                                 filter[i*filterSize + 0]= y1;
 717                                 filter[i*filterSize + 1]= y2;
 718                                 filter[i*filterSize + 2]= y3;
 719                                 filter[i*filterSize + 3]= y4;
 720 //                              printf("%1.3f %1.3f %1.3f %1.3f %1.3f\n",d , y1, y2, y3, y4);
 721                         }
 722                         else
 723                         {
 724                                 //Bilinear upscale / linear interpolate / Area averaging
 725                                 for(j=0; j<filterSize; j++)
 726                                 {
 727                                         double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
 728                                         double coeff= 1.0 - d;
 729                                         if(coeff<0) coeff=0;
 730         //                              printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
 731                                         filter[i*filterSize + j]= coeff;
 732                                         xx++;
 733                                 }
 734                         }
 735                         xDstInSrc+= xInc;
 736                 }
 737         }
 738         else // downscale
 739         {
 740                 int xDstInSrc;
 741                 if(flags&SWS_BICUBIC)   filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
 742                 else if(flags&SWS_X)    filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
 743                 else if(flags&SWS_AREA) filterSize= (int)ceil(1 + 1.0*srcW / (double)dstW);
 744                 else /* BILINEAR */     filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
 745 //              printf("%d %d %d\n", *filterSize, srcW, dstW);
 746                 filter= (double*)memalign(8, dstW*sizeof(double)*filterSize);
 747
 748                 xDstInSrc= xInc/2 - 0x8000;
 749                 for(i=0; i<dstW; i++)
 750                 {
 751                         int xx= (int)((double)xDstInSrc/(double)(1<<16) - (filterSize-1)*0.5 + 0.5);
 752                         int j;
 753                         (*filterPos)[i]= xx;
 754                         for(j=0; j<filterSize; j++)
 755                         {
 756                                 double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
 757                                 double coeff;
 758                                 if((flags & SWS_BICUBIC) || (flags & SWS_X))
 759                                 {
 760                                         double A= -0.75;
 761 //                                      d*=2;
 762                                         // Equation is from VirtualDub
 763                                         if(d<1.0)
 764                                                 coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
 765                                         else if(d<2.0)
 766                                                 coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
 767                                         else
 768                                                 coeff=0.0;
 769                                 }
 770                                 else if(flags & SWS_AREA)
 771                                 {
 772                                         double srcPixelSize= (1<<16)/(double)xInc;
 773                                         if(d + srcPixelSize/2 < 0.5) coeff= 1.0;
 774                                         else if(d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
 775                                         else coeff=0.0;
 776                                 }
 777                                 else
 778                                 {
 779                                         coeff= 1.0 - d;
 780                                         if(coeff<0) coeff=0;
 781                                 }
 782 //                              printf("%1.3f %2.3f %d \n", coeff, d, xDstInSrc);
 783                                 filter[i*filterSize + j]= coeff;
 784                                 xx++;
 785                         }
 786                         xDstInSrc+= xInc;
 787                 }
 788         }
 789
 790         /* apply src & dst Filter to filter -> filter2
 791            free(filter);
 792         */
 793         filter2Size= filterSize;
 794         if(srcFilter) filter2Size+= srcFilter->length - 1;
 795         if(dstFilter) filter2Size+= dstFilter->length - 1;
 796         filter2= (double*)memalign(8, filter2Size*dstW*sizeof(double));
 797
 798         for(i=0; i<dstW; i++)
 799         {
 800                 int j;
 801                 SwsVector scaleFilter;
 802                 SwsVector *outVec;
 803
 804                 scaleFilter.coeff= filter + i*filterSize;
 805                 scaleFilter.length= filterSize;
 806
 807                 if(srcFilter) outVec= getConvVec(srcFilter, &scaleFilter);
 808                 else          outVec= &scaleFilter;
 809
 810                 ASSERT(outVec->length == filter2Size)
 811                 //FIXME dstFilter
 812
 813                 for(j=0; j<outVec->length; j++)
 814                 {
 815                         filter2[i*filter2Size + j]= outVec->coeff[j];
 816                 }
 817
 818                 (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
 819
 820                 if(outVec != &scaleFilter) freeVec(outVec);
 821         }
 822         free(filter); filter=NULL;
 823
 824         /* try to reduce the filter-size (step1 find size and shift left) */
 825         // Assume its near normalized (*0.5 or *2.0 is ok but * 0.001 is not)
 826         minFilterSize= 0;
 827         for(i=dstW-1; i>=0; i--)
 828         {
 829                 int min= filter2Size;
 830                 int j;
 831                 double cutOff=0.0;
 832
 833                 /* get rid off near zero elements on the left by shifting left */
 834                 for(j=0; j<filter2Size; j++)
 835                 {
 836                         int k;
 837                         cutOff += ABS(filter2[i*filter2Size]);
 838
 839                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
 840
 841                         /* preserve Monotonicity because the core cant handle the filter otherwise */
 842                         if(i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
 843
 844                         // Move filter coeffs left
 845                         for(k=1; k<filter2Size; k++)
 846                                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
 847                         filter2[i*filter2Size + k - 1]= 0.0;
 848                         (*filterPos)[i]++;
 849                 }
 850
 851                 cutOff=0.0;
 852                 /* count near zeros on the right */
 853                 for(j=filter2Size-1; j>0; j--)
 854                 {
 855                         cutOff += ABS(filter2[i*filter2Size + j]);
 856
 857                         if(cutOff > SWS_MAX_REDUCE_CUTOFF) break;
 858                         min--;
 859                 }
 860
 861                 if(min>minFilterSize) minFilterSize= min;
 862         }
 863
 864         filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
 865         filter= (double*)memalign(8, filterSize*dstW*sizeof(double));
 866         *outFilterSize= filterSize;
 867
 868         if((flags&SWS_PRINT_INFO) && verbose)
 869                 printf("SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
 870         /* try to reduce the filter-size (step2 reduce it) */
 871         for(i=0; i<dstW; i++)
 872         {
 873                 int j;
 874
 875                 for(j=0; j<filterSize; j++)
 876                 {
 877                         if(j>=filter2Size) filter[i*filterSize + j]= 0.0;
 878                         else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
 879                 }
 880         }
 881         free(filter2); filter2=NULL;
 882
 883         ASSERT(filterSize > 0)
 884
 885         //FIXME try to align filterpos if possible
 886
 887         //fix borders
 888         for(i=0; i<dstW; i++)
 889         {
 890                 int j;
 891                 if((*filterPos)[i] < 0)
 892                 {
 893                         // Move filter coeffs left to compensate for filterPos
 894                         for(j=1; j<filterSize; j++)
 895                         {
 896                                 int left= MAX(j + (*filterPos)[i], 0);
 897                                 filter[i*filterSize + left] += filter[i*filterSize + j];
 898                                 filter[i*filterSize + j]=0;
 899                         }
 900                         (*filterPos)[i]= 0;
 901                 }
 902
 903                 if((*filterPos)[i] + filterSize > srcW)
 904                 {
 905                         int shift= (*filterPos)[i] + filterSize - srcW;
 906                         // Move filter coeffs right to compensate for filterPos
 907                         for(j=filterSize-2; j>=0; j--)
 908                         {
 909                                 int right= MIN(j + shift, filterSize-1);
 910                                 filter[i*filterSize +right] += filter[i*filterSize +j];
 911                                 filter[i*filterSize +j]=0;
 912                         }
 913                         (*filterPos)[i]= srcW - filterSize;
 914                 }
 915         }
 916
 917         // Note the +1 is for the MMXscaler which reads over the end
 918         *outFilter= (int16_t*)memalign(8, *outFilterSize*(dstW+1)*sizeof(int16_t));
 919         memset(*outFilter, 0, *outFilterSize*(dstW+1)*sizeof(int16_t));
 920
 921         /* Normalize & Store in outFilter */
 922         for(i=0; i<dstW; i++)
 923         {
 924                 int j;
 925                 double sum=0;
 926                 double scale= one;
 927                 for(j=0; j<filterSize; j++)
 928                 {
 929                         sum+= filter[i*filterSize + j];
 930                 }
 931                 scale/= sum;
 932                 for(j=0; j<filterSize; j++)
 933                 {
 934                         (*outFilter)[i*(*outFilterSize) + j]= (int)(filter[i*filterSize + j]*scale);
 935                 }
 936         }
 937
 938         free(filter);
 939 }
 940
 941 #ifdef ARCH_X86
     /**
      * Generates horizontal-scaler machine code into funnyCode by replicating
      * and patching a template fragment taken from the inline asm below.
      *
      * dstW      - output width; the generation loop runs dstW/8 times
      * xInc      - source step per output pixel; positions built from it are
      *             converted to whole source pixels with >>16
      * funnyCode - destination buffer for the generated code; its size is not
      *             checked here, so the caller must make it large enough
      */
 942 static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
 943 {
 944         uint8_t *fragment;
 945         int imm8OfPShufW1;
 946         int imm8OfPShufW2;
 947         int fragmentLength;
 948
 949         int xpos, i;
 950
 951         // create an optimized horizontal scaling routine
 952
 953         //code fragment
 954
             // The template between labels 0 and 9 is not executed here: "jmp 9f"
             // skips it and only the code after label 9 runs. That code returns
             //   %0 = address of label 0 (start of the template),
             //   %1, %2 = offset, from label 0, of the byte just before labels 1
             //            and 2 (going by the variable names, the imm8 of the two
             //            pshufw instructions),
             //   %3 = length of the template (label 9 - label 0).
 955         asm volatile(
 956                 "jmp 9f                         \n\t"
 957         // Begin
 958                 "0:                             \n\t"
 959                 "movq (%%esi), %%mm0            \n\t" //FIXME Alignment
 960                 "movq %%mm0, %%mm1              \n\t"
 961                 "psrlq $8, %%mm0                \n\t"
 962                 "punpcklbw %%mm7, %%mm1 \n\t"
 963                 "movq %%mm2, %%mm3              \n\t"
 964                 "punpcklbw %%mm7, %%mm0 \n\t"
 965                 "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
 966                 "pshufw $0xFF, %%mm1, %%mm1     \n\t"
 967                 "1:                             \n\t"
 968                 "adcl %%edx, %%esi              \n\t" //xx+= (4*lumXInc)>>16 + carry
 969                 "pshufw $0xFF, %%mm0, %%mm0     \n\t"
 970                 "2:                             \n\t"
 971                 "psrlw $9, %%mm3                \n\t"
 972                 "psubw %%mm1, %%mm0             \n\t"
 973                 "pmullw %%mm3, %%mm0            \n\t"
 974                 "paddw %%mm6, %%mm2             \n\t" // 2*alpha += xpos&0xFFFF
 975                 "psllw $7, %%mm1                \n\t"
 976                 "paddw %%mm1, %%mm0             \n\t"
 977
 978                 "movq %%mm0, (%%edi, %%eax)     \n\t"
 979
 980                 "addl $8, %%eax                 \n\t"
 981         // End
 982                 "9:                             \n\t"
 983 //              "int $3\n\t"
 984                 "leal 0b, %0                    \n\t"
 985                 "leal 1b, %1                    \n\t"
 986                 "leal 2b, %2                    \n\t"
 987                 "decl %1                        \n\t"
 988                 "decl %2                        \n\t"
 989                 "subl %0, %1                    \n\t"
 990                 "subl %0, %2                    \n\t"
 991                 "leal 9b, %3                    \n\t"
 992                 "subl %0, %3                    \n\t"
 993                 :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
 994                 "=r" (fragmentLength)
 995         );
 996
 997         xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
 998
             // xpos advances by xInc on every iteration, but a template copy is
             // only emitted on every 4th one (i = 0, 4, 8, ...), so consecutive
             // copies land back to back at funnyCode + fragmentLength*(i/4).
 999         for(i=0; i<dstW/8; i++)
1000         {
1001                 int xx=xpos>>16;
1002
1003                 if((i&3) == 0)
1004                 {
                             // a..d: whole-pixel source offsets, relative to xx, of the
                             // positions xpos, xpos+xInc, xpos+2*xInc and xpos+3*xInc
1005                         int a=0;
1006                         int b=((xpos+xInc)>>16) - xx;
1007                         int c=((xpos+xInc*2)>>16) - xx;
1008                         int d=((xpos+xInc*3)>>16) - xx;
1009
1010                         memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);
1011
                             // patch both recorded bytes of this copy with the four
                             // offsets packed as 2-bit fields (a in bits 0-1 ... d in bits 6-7)
1012                         funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
1013                         funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
1014                                 a | (b<<2) | (c<<4) | (d<<6);
1015
1016                         // if we dont need to read 8 bytes than dont :), reduces the chance of
1017                         // crossing a cache line
                             // NOTE(review): byte 1 of the copy is overwritten with 0x6E;
                             // presumably this turns the leading movq into a movd load - confirm
1018                         if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;
1019
                             // terminate the generated code right after this copy; the
                             // next copy, if there is one, starts at and overwrites this byte
1020                         funnyCode[fragmentLength*(i+4)/4]= RET;
1021                 }
1022                 xpos+=xInc;
1023         }
1024 }
1025 #endif // ARCH_X86
1026
1027 //FIXME remove
/* Does nothing; the body is intentionally empty. */
void SwScale_Init()
{
}
1030
1031 static void globalInit(){
1032     // generating tables:
1033     int i;
1034     for(i=0; i<768; i++){
1035         int c= MIN(MAX(i-256, 0), 255);
1036         clip_table[i]=c;
1037         yuvtab_2568[c]= clip_yuvtab_2568[i]=(0x2568*(c-16))+(256<<13);
1038         yuvtab_3343[c]= clip_yuvtab_3343[i]=0x3343*(c-128);
1039         yuvtab_0c92[c]= clip_yuvtab_0c92[i]=-0x0c92*(c-128);
1040         yuvtab_1a1e[c]= clip_yuvtab_1a1e[i]=-0x1a1e*(c-128);
1041         yuvtab_40cf[c]= clip_yuvtab_40cf[i]=0x40cf*(c-128);
1042     }
1043
1044     for(i=0; i<768; i++)
1045     {
1046         int v= clip_table[i];
1047         clip_table16b[i]= v>>3;
1048         clip_table16g[i]= (v<<3)&0x07E0;
1049         clip_table16r[i]= (v<<8)&0xF800;
1050         clip_table15b[i]= v>>3;
1051         clip_table15g[i]= (v<<2)&0x03E0;
1052         clip_table15r[i]= (v<<7)&0x7C00;
1053     }
1054
1055 cpuCaps= gCpuCaps;
1056
1057 #ifdef RUNTIME_CPUDETECT
1058 #ifdef CAN_COMPILE_X86_ASM
1059         // ordered per speed fasterst first
1060         if(gCpuCaps.hasMMX2)
1061                 swScale= swScale_MMX2;
1062         else if(gCpuCaps.has3DNow)
1063                 swScale= swScale_3DNow;
1064         else if(gCpuCaps.hasMMX)
1065                 swScale= swScale_MMX;
1066         else
1067                 swScale= swScale_C;
1068
1069 #else
1070         swScale= swScale_C;
1071         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1072 #endif
1073 #else //RUNTIME_CPUDETECT
1074 #ifdef HAVE_MMX2
1075         swScale= swScale_MMX2;
1076         cpuCaps.has3DNow = 0;
1077 #elif defined (HAVE_3DNOW)
1078         swScale= swScale_3DNow;
1079         cpuCaps.hasMMX2 = 0;
1080 #elif defined (HAVE_MMX)
1081         swScale= swScale_MMX;
1082         cpuCaps.hasMMX2 = cpuCaps.has3DNow = 0;
1083 #else
1084         swScale= swScale_C;
1085         cpuCaps.hasMMX2 = cpuCaps.hasMMX = cpuCaps.has3DNow = 0;
1086 #endif
1087 #endif //!RUNTIME_CPUDETECT
1088 }
1089
1090
1091 SwsContext *getSwsContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
1092                          SwsFilter *srcFilter, SwsFilter *dstFilter){
1093
1094         SwsContext *c;
1095         int i;
1096         SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
1097
1098 #ifdef ARCH_X86
1099         if(gCpuCaps.hasMMX)
1100                 asm volatile("emms\n\t"::: "memory");
1101 #endif
1102
1103         if(swScale==NULL) globalInit();
1104
1105         /* sanity check */
1106         if(srcW<4 || srcH<1 || dstW<8 || dstH<1) return NULL; //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
1107
1108         if(srcFormat!=IMGFMT_YV12 && srcFormat!=IMGFMT_I420 && srcFormat!=IMGFMT_IYUV) return NULL;
1109
1110         if(!dstFilter) dstFilter= &dummyFilter;
1111         if(!srcFilter) srcFilter= &dummyFilter;
1112
1113         c= memalign(64, sizeof(SwsContext));
1114         memset(c, 0, sizeof(SwsContext));
1115
1116         c->srcW= srcW;
1117         c->srcH= srcH;
1118         c->dstW= dstW;
1119         c->dstH= dstH;
1120         c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
1121         c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
1122         c->flags= flags;
1123         c->dstFormat= dstFormat;
1124         c->srcFormat= srcFormat;
1125
1126         if(cpuCaps.hasMMX2)
1127         {
1128                 c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
1129                 if(!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
1130                 {
1131                         if(flags&SWS_PRINT_INFO)
1132                                 fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
1133                 }
1134         }
1135         else
1136                 c->canMMX2BeUsed=0;
1137
1138         // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
1139         // but only for the FAST_BILINEAR mode otherwise do correct scaling
1140         // n-2 is the last chrominance sample available
1141         // this is not perfect, but noone shuld notice the difference, the more correct variant
1142         // would be like the vertical one, but that would require some special code for the
1143         // first and last pixel
1144         if(flags&SWS_FAST_BILINEAR)
1145         {
1146                 if(c->canMMX2BeUsed)    c->lumXInc+= 20;
1147                 //we dont use the x86asm scaler if mmx is available
1148                 else if(cpuCaps.hasMMX) c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
1149         }
1150
1151         /* set chrXInc & chrDstW */
1152         if((flags&SWS_FULL_UV_IPOL) && !isHalfChrH(dstFormat))
1153                 c->chrXInc= c->lumXInc>>1, c->chrDstW= dstW;
1154         else
1155                 c->chrXInc= c->lumXInc,    c->chrDstW= (dstW+1)>>1;
1156
1157         /* set chrYInc & chrDstH */
1158         if(isHalfChrV(dstFormat))
1159                 c->chrYInc= c->lumYInc,    c->chrDstH= (dstH+1)>>1;
1160         else    c->chrYInc= c->lumYInc>>1, c->chrDstH= dstH;
1161
1162         /* precalculate horizontal scaler filter coefficients */
1163         {
1164                 const int filterAlign= cpuCaps.hasMMX ? 4 : 1;
1165
1166                 initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
1167                                  srcW      ,       dstW, filterAlign, 1<<14, flags,
1168                                  srcFilter->lumH, dstFilter->lumH);
1169                 initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
1170                                 (srcW+1)>>1, c->chrDstW, filterAlign, 1<<14, flags,
1171                                  srcFilter->chrH, dstFilter->chrH);
1172
1173 #ifdef ARCH_X86
1174 // cant downscale !!!
1175                 if(c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR))
1176                 {
1177                         initMMX2HScaler(      dstW, c->lumXInc, c->funnyYCode);
1178                         initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode);
1179                 }
1180 #endif
1181         } // Init Horizontal stuff
1182
1183
1184
1185         /* precalculate vertical scaler filter coefficients */
1186         initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
1187                         srcH      ,        dstH, 1, (1<<12)-4, flags,
1188                         srcFilter->lumV, dstFilter->lumV);
1189         initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
1190                         (srcH+1)>>1, c->chrDstH, 1, (1<<12)-4, flags,
1191                          srcFilter->chrV, dstFilter->chrV);
1192
1193         // Calculate Buffer Sizes so that they wont run out while handling these damn slices
1194         c->vLumBufSize= c->vLumFilterSize;
1195         c->vChrBufSize= c->vChrFilterSize;
1196         for(i=0; i<dstH; i++)
1197         {
1198                 int chrI= i*c->chrDstH / dstH;
1199                 int nextSlice= MAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
1200                                  ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<1));
1201                 nextSlice&= ~1; // Slices start at even boundaries
1202                 if(c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
1203                         c->vLumBufSize= nextSlice - c->vLumFilterPos[i   ];
1204                 if(c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>1))
1205                         c->vChrBufSize= (nextSlice>>1) - c->vChrFilterPos[chrI];
1206         }
1207
1208         // allocate pixbufs (we use dynamic allocation because otherwise we would need to
1209         c->lumPixBuf= (int16_t**)memalign(4, c->vLumBufSize*2*sizeof(int16_t*));
1210         c->chrPixBuf= (int16_t**)memalign(4, c->vChrBufSize*2*sizeof(int16_t*));
1211         //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
1212         for(i=0; i<c->vLumBufSize; i++)
1213                 c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= (uint16_t*)memalign(8, 4000);
1214         for(i=0; i<c->vChrBufSize; i++)
1215                 c->chrPixBuf[i]= c->chrPixBuf[i+c->vChrBufSize]= (uint16_t*)memalign(8, 8000);
1216
1217         //try to avoid drawing green stuff between the right end and the stride end
1218         for(i=0; i<c->vLumBufSize; i++) memset(c->lumPixBuf[i], 0, 4000);
1219         for(i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, 8000);
1220
1221         ASSERT(c->chrDstH <= dstH)
1222
1223         // pack filter data for mmx code
1224         if(cpuCaps.hasMMX)
1225         {
1226                 c->lumMmxFilter= (int16_t*)memalign(8, c->vLumFilterSize*      dstH*4*sizeof(int16_t));
1227                 c->chrMmxFilter= (int16_t*)memalign(8, c->vChrFilterSize*c->chrDstH*4*sizeof(int16_t));
1228                 for(i=0; i<c->vLumFilterSize*dstH; i++)
1229                         c->lumMmxFilter[4*i]=c->lumMmxFilter[4*i+1]=c->lumMmxFilter[4*i+2]=c->lumMmxFilter[4*i+3]=
1230                                 c->vLumFilter[i];
1231                 for(i=0; i<c->vChrFilterSize*c->chrDstH; i++)
1232                         c->chrMmxFilter[4*i]=c->chrMmxFilter[4*i+1]=c->chrMmxFilter[4*i+2]=c->chrMmxFilter[4*i+3]=
1233                                 c->vChrFilter[i];
1234         }
1235
1236         if(flags&SWS_PRINT_INFO)
1237         {
1238 #ifdef DITHER1XBPP
1239                 char *dither= " dithered";
1240 #else
1241                 char *dither= "";
1242 #endif
1243                 if(flags&SWS_FAST_BILINEAR)
1244                         fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
1245                 else if(flags&SWS_BILINEAR)
1246                         fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
1247                 else if(flags&SWS_BICUBIC)
1248                         fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
1249                 else if(flags&SWS_POINT)
1250                         fprintf(stderr, "\nSwScaler: Nearest Neighbor / POINT scaler ");
1251                 else if(flags&SWS_AREA)
1252                         fprintf(stderr, "\nSwScaler: Area Averageing scaler ");
1253                 else
1254                         fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
1255
1256                 if(dstFormat==IMGFMT_BGR15)
1257                         fprintf(stderr, "with%s BGR15 output ", dither);
1258                 else if(dstFormat==IMGFMT_BGR16)
1259                         fprintf(stderr, "with%s BGR16 output ", dither);
1260                 else if(dstFormat==IMGFMT_BGR24)
1261                         fprintf(stderr, "with BGR24 output ");
1262                 else if(dstFormat==IMGFMT_BGR32)
1263                         fprintf(stderr, "with BGR32 output ");
1264                 else if(dstFormat==IMGFMT_YV12)
1265                         fprintf(stderr, "with YV12 output ");
1266                 else if(dstFormat==IMGFMT_I420)
1267                         fprintf(stderr, "with I420 output ");
1268                 else if(dstFormat==IMGFMT_IYUV)
1269                         fprintf(stderr, "with IYUV output ");
1270                 else
1271                         fprintf(stderr, "without output ");
1272
1273                 if(cpuCaps.hasMMX2)
1274                         fprintf(stderr, "using MMX2\n");
1275                 else if(cpuCaps.has3DNow)
1276                         fprintf(stderr, "using 3DNOW\n");
1277                 else if(cpuCaps.hasMMX)
1278                         fprintf(stderr, "using MMX\n");
1279                 else
1280                         fprintf(stderr, "using C\n");
1281         }
1282
1283         if((flags & SWS_PRINT_INFO) && verbose)
1284         {
1285                 if(cpuCaps.hasMMX)
1286                 {
1287                         if(c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1288                                 printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1289                         else
1290                         {
1291                                 if(c->hLumFilterSize==4)
1292                                         printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
1293                                 else if(c->hLumFilterSize==8)
1294                                         printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
1295                                 else
1296                                         printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
1297
1298                                 if(c->hChrFilterSize==4)
1299                                         printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
1300                                 else if(c->hChrFilterSize==8)
1301                                         printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
1302                                 else
1303                                         printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
1304                         }
1305                 }
1306                 else
1307                 {
1308 #ifdef ARCH_X86
1309                         printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
1310 #else
1311                         if(flags & SWS_FAST_BILINEAR)
1312                                 printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
1313                         else
1314                                 printf("SwScaler: using C scaler for horizontal scaling\n");
1315 #endif
1316                 }
1317                 if(isPlanarYUV(dstFormat))
1318                 {
1319                         if(c->vLumFilterSize==1)
1320                                 printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1321                         else
1322                                 printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12 like)\n", cpuCaps.hasMMX ? "MMX" : "C");
1323                 }
1324                 else
1325                 {
1326                         if(c->vLumFilterSize==1 && c->vChrFilterSize==2)
1327                                 printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1328                                        "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n",cpuCaps.hasMMX ? "MMX" : "C");
1329                         else if(c->vLumFilterSize==2 && c->vChrFilterSize==2)
1330                                 printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1331                         else
1332                                 printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", cpuCaps.hasMMX ? "MMX" : "C");
1333                 }
1334
1335                 if(dstFormat==IMGFMT_BGR24)
1336                         printf("SwScaler: using %s YV12->BGR24 Converter\n",
1337                                 cpuCaps.hasMMX2 ? "MMX2" : (cpuCaps.hasMMX ? "MMX" : "C"));
1338                 else if(dstFormat==IMGFMT_BGR32)
1339                         printf("SwScaler: using %s YV12->BGR32 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1340                 else if(dstFormat==IMGFMT_BGR16)
1341                         printf("SwScaler: using %s YV12->BGR16 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1342                 else if(dstFormat==IMGFMT_BGR15)
1343                         printf("SwScaler: using %s YV12->BGR15 Converter\n", cpuCaps.hasMMX ? "MMX" : "C");
1344
1345                 printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1346         }
1347
1348         return c;
1349 }
1350
1351 /**
1352  * returns a normalized gaussian curve used to filter stuff
1353  * quality=3 is high quality, lowwer is lowwer quality
1354  */
1355
1356 SwsVector *getGaussianVec(double variance, double quality){
1357         const int length= (int)(variance*quality + 0.5) | 1;
1358         int i;
1359         double *coeff= memalign(sizeof(double), length*sizeof(double));
1360         double middle= (length-1)*0.5;
1361         SwsVector *vec= malloc(sizeof(SwsVector));
1362
1363         vec->coeff= coeff;
1364         vec->length= length;
1365
1366         for(i=0; i<length; i++)
1367         {
1368                 double dist= i-middle;
1369                 coeff[i]= exp( -dist*dist/(2*variance*variance) ) / sqrt(2*variance*PI);
1370         }
1371
1372         normalizeVec(vec, 1.0);
1373
1374         return vec;
1375 }
1376
1377 SwsVector *getConstVec(double c, int length){
1378         int i;
1379         double *coeff= memalign(sizeof(double), length*sizeof(double));
1380         SwsVector *vec= malloc(sizeof(SwsVector));
1381
1382         vec->coeff= coeff;
1383         vec->length= length;
1384
1385         for(i=0; i<length; i++)
1386                 coeff[i]= c;
1387
1388         return vec;
1389 }
1390
1391
1392 SwsVector *getIdentityVec(void){
1393         double *coeff= memalign(sizeof(double), sizeof(double));
1394         SwsVector *vec= malloc(sizeof(SwsVector));
1395         coeff[0]= 1.0;
1396
1397         vec->coeff= coeff;
1398         vec->length= 1;
1399
1400         return vec;
1401 }
1402
1403 void normalizeVec(SwsVector *a, double height){
1404         int i;
1405         double sum=0;
1406         double inv;
1407
1408         for(i=0; i<a->length; i++)
1409                 sum+= a->coeff[i];
1410
1411         inv= height/sum;
1412
1413         for(i=0; i<a->length; i++)
1414                 a->coeff[i]*= height;
1415 }
1416
1417 void scaleVec(SwsVector *a, double scalar){
1418         int i;
1419
1420         for(i=0; i<a->length; i++)
1421                 a->coeff[i]*= scalar;
1422 }
1423
1424 static SwsVector *getConvVec(SwsVector *a, SwsVector *b){
1425         int length= a->length + b->length - 1;
1426         double *coeff= memalign(sizeof(double), length*sizeof(double));
1427         int i, j;
1428         SwsVector *vec= malloc(sizeof(SwsVector));
1429
1430         vec->coeff= coeff;
1431         vec->length= length;
1432
1433         for(i=0; i<length; i++) coeff[i]= 0.0;
1434
1435         for(i=0; i<a->length; i++)
1436         {
1437                 for(j=0; j<b->length; j++)
1438                 {
1439                         coeff[i+j]+= a->coeff[i]*b->coeff[j];
1440                 }
1441         }
1442
1443         return vec;
1444 }
1445
1446 static SwsVector *sumVec(SwsVector *a, SwsVector *b){
1447         int length= MAX(a->length, b->length);
1448         double *coeff= memalign(sizeof(double), length*sizeof(double));
1449         int i;
1450         SwsVector *vec= malloc(sizeof(SwsVector));
1451
1452         vec->coeff= coeff;
1453         vec->length= length;
1454
1455         for(i=0; i<length; i++) coeff[i]= 0.0;
1456
1457         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1458         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
1459
1460         return vec;
1461 }
1462
1463 static SwsVector *diffVec(SwsVector *a, SwsVector *b){
1464         int length= MAX(a->length, b->length);
1465         double *coeff= memalign(sizeof(double), length*sizeof(double));
1466         int i;
1467         SwsVector *vec= malloc(sizeof(SwsVector));
1468
1469         vec->coeff= coeff;
1470         vec->length= length;
1471
1472         for(i=0; i<length; i++) coeff[i]= 0.0;
1473
1474         for(i=0; i<a->length; i++) coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
1475         for(i=0; i<b->length; i++) coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
1476
1477         return vec;
1478 }
1479
1480 /* shift left / or right if "shift" is negative */
1481 static SwsVector *getShiftedVec(SwsVector *a, int shift){
1482         int length= a->length + ABS(shift)*2;
1483         double *coeff= memalign(sizeof(double), length*sizeof(double));
1484         int i;
1485         SwsVector *vec= malloc(sizeof(SwsVector));
1486
1487         vec->coeff= coeff;
1488         vec->length= length;
1489
1490         for(i=0; i<length; i++) coeff[i]= 0.0;
1491
1492         for(i=0; i<a->length; i++)
1493         {
1494                 coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
1495         }
1496
1497         return vec;
1498 }
1499
1500 void shiftVec(SwsVector *a, int shift){
1501         SwsVector *shifted= getShiftedVec(a, shift);
1502         free(a->coeff);
1503         a->coeff= shifted->coeff;
1504         a->length= shifted->length;
1505         free(shifted);
1506 }
1507
1508 void addVec(SwsVector *a, SwsVector *b){
1509         SwsVector *sum= sumVec(a, b);
1510         free(a->coeff);
1511         a->coeff= sum->coeff;
1512         a->length= sum->length;
1513         free(sum);
1514 }
1515
1516 void subVec(SwsVector *a, SwsVector *b){
1517         SwsVector *diff= diffVec(a, b);
1518         free(a->coeff);
1519         a->coeff= diff->coeff;
1520         a->length= diff->length;
1521         free(diff);
1522 }
1523
1524 void convVec(SwsVector *a, SwsVector *b){
1525         SwsVector *conv= getConvVec(a, b);
1526         free(a->coeff);
1527         a->coeff= conv->coeff;
1528         a->length= conv->length;
1529         free(conv);
1530 }
1531
1532 SwsVector *cloneVec(SwsVector *a){
1533         double *coeff= memalign(sizeof(double), a->length*sizeof(double));
1534         int i;
1535         SwsVector *vec= malloc(sizeof(SwsVector));
1536
1537         vec->coeff= coeff;
1538         vec->length= a->length;
1539
1540         for(i=0; i<a->length; i++) coeff[i]= a->coeff[i];
1541
1542         return vec;
1543 }
1544
1545 void printVec(SwsVector *a){
1546         int i;
1547         double max=0;
1548         double min=0;
1549         double range;
1550
1551         for(i=0; i<a->length; i++)
1552                 if(a->coeff[i]>max) max= a->coeff[i];
1553
1554         for(i=0; i<a->length; i++)
1555                 if(a->coeff[i]<min) min= a->coeff[i];
1556
1557         range= max - min;
1558
1559         for(i=0; i<a->length; i++)
1560         {
1561                 int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
1562                 printf("%1.3f ", a->coeff[i]);
1563                 for(;x>0; x--) printf(" ");
1564                 printf("|\n");
1565         }
1566 }
1567
1568 void freeVec(SwsVector *a){
1569         if(!a) return;
1570         if(a->coeff) free(a->coeff);
1571         a->coeff=NULL;
1572         a->length=0;
1573         free(a);
1574 }
1575
1576 void freeSwsContext(SwsContext *c){
1577         int i;
1578
1579         if(!c) return;
1580
1581         if(c->lumPixBuf)
1582         {
1583                 for(i=0; i<c->vLumBufSize; i++)
1584                 {
1585                         if(c->lumPixBuf[i]) free(c->lumPixBuf[i]);
1586                         c->lumPixBuf[i]=NULL;
1587                 }
1588                 free(c->lumPixBuf);
1589                 c->lumPixBuf=NULL;
1590         }
1591
1592         if(c->chrPixBuf)
1593         {
1594                 for(i=0; i<c->vChrBufSize; i++)
1595                 {
1596                         if(c->chrPixBuf[i]) free(c->chrPixBuf[i]);
1597                         c->chrPixBuf[i]=NULL;
1598                 }
1599                 free(c->chrPixBuf);
1600                 c->chrPixBuf=NULL;
1601         }
1602
1603         if(c->vLumFilter) free(c->vLumFilter);
1604         c->vLumFilter = NULL;
1605         if(c->vChrFilter) free(c->vChrFilter);
1606         c->vChrFilter = NULL;
1607         if(c->hLumFilter) free(c->hLumFilter);
1608         c->hLumFilter = NULL;
1609         if(c->hChrFilter) free(c->hChrFilter);
1610         c->hChrFilter = NULL;
1611
1612         if(c->vLumFilterPos) free(c->vLumFilterPos);
1613         c->vLumFilterPos = NULL;
1614         if(c->vChrFilterPos) free(c->vChrFilterPos);
1615         c->vChrFilterPos = NULL;
1616         if(c->hLumFilterPos) free(c->hLumFilterPos);
1617         c->hLumFilterPos = NULL;
1618         if(c->hChrFilterPos) free(c->hChrFilterPos);
1619         c->hChrFilterPos = NULL;
1620
1621         if(c->lumMmxFilter) free(c->lumMmxFilter);
1622         c->lumMmxFilter = NULL;
1623         if(c->chrMmxFilter) free(c->chrMmxFilter);
1624         c->chrMmxFilter = NULL;
1625
1626         free(c);
1627 }
1628
1629