git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  * the C code (not assembly, mmx, ...) of this file can be used
  21  * under the LGPL license too
  22  */
  23
  24 /*
  25   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  26   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  27   {BGR,RGB}{1,4,8,15,16} support dithering
  28
  29   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  30   YV12 -> {BGR,RGB}{1,4,8,15,16,24,32}
  31   x -> x
  32   YUV9 -> YV12
  33   YUV9/YV12 -> Y800
  34   Y800 -> YUV9/YV12
  35   BGR24 -> BGR32 & RGB24 -> RGB32
  36   BGR32 -> BGR24 & RGB32 -> RGB24
  37   BGR15 -> BGR16
  38 */
  39
  40 /*
  41 tested special converters (most are tested actually, but I did not write it down ...)
  42  YV12 -> BGR16
  43  YV12 -> YV12
  44  BGR15 -> BGR16
  45  BGR16 -> BGR16
  46  YVU9 -> YV12
  47
  48 untested special converters
  49   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  50   YV12/I420 -> YV12/I420
  51   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  52   BGR24 -> BGR32 & RGB24 -> RGB32
  53   BGR32 -> BGR24 & RGB32 -> RGB24
  54   BGR24 -> YV12
  55 */
  56
  57 #define _SVID_SOURCE //needed for MAP_ANONYMOUS
  58 #include <inttypes.h>
  59 #include <string.h>
  60 #include <math.h>
  61 #include <stdio.h>
  62 #include "config.h"
  63 #include <assert.h>
  64 #if HAVE_SYS_MMAN_H
  65 #include <sys/mman.h>
  66 #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
  67 #define MAP_ANONYMOUS MAP_ANON
  68 #endif
  69 #endif
  70 #if HAVE_VIRTUALALLOC
  71 #define WIN32_LEAN_AND_MEAN
  72 #include <windows.h>
  73 #endif
  74 #include "swscale.h"
  75 #include "swscale_internal.h"
  76 #include "rgb2rgb.h"
  77 #include "libavutil/intreadwrite.h"
  78 #include "libavutil/x86_cpu.h"
  79 #include "libavutil/avutil.h"
  80 #include "libavutil/bswap.h"
  81 #include "libavutil/pixdesc.h"
  82
  83 unsigned swscale_version(void)
  84 {
  85     return LIBSWSCALE_VERSION_INT;
  86 }
  87
  88 const char * swscale_configuration(void)
  89 {
  90     return FFMPEG_CONFIGURATION;
  91 }
  92
  93 const char * swscale_license(void)
  94 {
  95 #define LICENSE_PREFIX "libswscale license: "
  96     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
  97 }
  98
  /* Drop any definition of these two macro names that may be active at
   * this point. */
  99 #undef MOVNTQ
 100 #undef PAVGB
 101
 /* Disabled switches, kept as comments. */
 102 //#undef HAVE_MMX2
 103 //#define HAVE_AMD3DNOW
 104 //#undef HAVE_MMX
 105 //#undef ARCH_X86
 /* Defined without a value; it is not tested anywhere in the visible part
  * of this file. */
 106 #define DITHER1XBPP
 107
 /* Tested with #ifdef further down to choose between two sets of
  * ff_bgr2{Y,U,V}Coeff constants. */
 108 #define FAST_BGR2YV12 // use 7 bit coefficients instead of 15 bit
 109
 110 #define RET 0xC3 //near return opcode for x86
 111
 /* PI: use M_PI when it is defined, otherwise fall back to a literal. */
 112 #ifdef M_PI
 113 #define PI M_PI
 114 #else
 115 #define PI 3.14159265358979323846
 116 #endif
 117
 /**
  * Nonzero when pixel format x is one of the input formats listed below.
  *
  * x is expanded once per comparison, so pass an expression without side
  * effects.
  *
  * PIX_FMT_YUV440P used to be tested twice in this list; the redundant
  * second test has been removed (the result is unchanged).
  */
 #define isSupportedIn(x)    (       \
            (x)==PIX_FMT_YUV420P     \
         || (x)==PIX_FMT_YUVA420P    \
         || (x)==PIX_FMT_YUYV422     \
         || (x)==PIX_FMT_UYVY422     \
         || (x)==PIX_FMT_RGB48BE     \
         || (x)==PIX_FMT_RGB48LE     \
         || (x)==PIX_FMT_RGB32       \
         || (x)==PIX_FMT_RGB32_1     \
         || (x)==PIX_FMT_BGR24       \
         || (x)==PIX_FMT_BGR565      \
         || (x)==PIX_FMT_BGR555      \
         || (x)==PIX_FMT_BGR32       \
         || (x)==PIX_FMT_BGR32_1     \
         || (x)==PIX_FMT_RGB24       \
         || (x)==PIX_FMT_RGB565      \
         || (x)==PIX_FMT_RGB555      \
         || (x)==PIX_FMT_GRAY8       \
         || (x)==PIX_FMT_YUV410P     \
         || (x)==PIX_FMT_YUV440P     \
         || (x)==PIX_FMT_NV12        \
         || (x)==PIX_FMT_NV21        \
         || (x)==PIX_FMT_GRAY16BE    \
         || (x)==PIX_FMT_GRAY16LE    \
         || (x)==PIX_FMT_YUV444P     \
         || (x)==PIX_FMT_YUV422P     \
         || (x)==PIX_FMT_YUV411P     \
         || (x)==PIX_FMT_PAL8        \
         || (x)==PIX_FMT_BGR8        \
         || (x)==PIX_FMT_RGB8        \
         || (x)==PIX_FMT_BGR4_BYTE   \
         || (x)==PIX_FMT_RGB4_BYTE   \
         || (x)==PIX_FMT_MONOWHITE   \
         || (x)==PIX_FMT_MONOBLACK   \
         || (x)==PIX_FMT_YUV420P16LE \
         || (x)==PIX_FMT_YUV422P16LE \
         || (x)==PIX_FMT_YUV444P16LE \
         || (x)==PIX_FMT_YUV420P16BE \
         || (x)==PIX_FMT_YUV422P16BE \
         || (x)==PIX_FMT_YUV444P16BE \
     )
 /**
  * Nonzero when pixel format x is one of the output formats listed below,
  * or when isRGB(x) or isBGR(x) is nonzero.
  *
  * x is expanded once per test, so pass an expression without side effects.
  */
 #define isSupportedOut(x)   (                                                 \
        (x)==PIX_FMT_YUV420P     || (x)==PIX_FMT_YUVA420P                      \
     || (x)==PIX_FMT_YUYV422     || (x)==PIX_FMT_UYVY422                       \
     || (x)==PIX_FMT_YUV444P     || (x)==PIX_FMT_YUV422P                       \
     || (x)==PIX_FMT_YUV411P                                                   \
     || isRGB(x)                 || isBGR(x)                                   \
     || (x)==PIX_FMT_NV12        || (x)==PIX_FMT_NV21                          \
     || (x)==PIX_FMT_GRAY16BE    || (x)==PIX_FMT_GRAY16LE                      \
     || (x)==PIX_FMT_GRAY8                                                     \
     || (x)==PIX_FMT_YUV410P     || (x)==PIX_FMT_YUV440P                       \
     || (x)==PIX_FMT_YUV420P16LE || (x)==PIX_FMT_YUV422P16LE                   \
     || (x)==PIX_FMT_YUV444P16LE                                               \
     || (x)==PIX_FMT_YUV420P16BE || (x)==PIX_FMT_YUV422P16BE                   \
     || (x)==PIX_FMT_YUV444P16BE                                               \
     )
 /**
  * Nonzero when x is PIX_FMT_PAL8, PIX_FMT_YUYV422 or PIX_FMT_UYVY422, or
  * when isRGB(x) or isBGR(x) is nonzero.
  *
  * x is expanded once per test, so pass an expression without side effects.
  */
 #define isPacked(x)         (                                              \
        (x)==PIX_FMT_PAL8                                                   \
     || (x)==PIX_FMT_YUYV422 || (x)==PIX_FMT_UYVY422                        \
     || isRGB(x)             || isBGR(x)                                    \
     )
 /**
  * Nonzero when x is PIX_FMT_PAL8 or one of the 8-bit / 4-bit-per-byte
  * RGB/BGR formats listed below.
  *
  * x is expanded once per comparison, so pass an expression without side
  * effects.
  */
 #define usePal(x)           (                                              \
        (x)==PIX_FMT_PAL8                                                   \
     || (x)==PIX_FMT_BGR4_BYTE || (x)==PIX_FMT_RGB4_BYTE                    \
     || (x)==PIX_FMT_BGR8      || (x)==PIX_FMT_RGB8                         \
     )
 198
 /*
  * Integer RGB -> YUV coefficients.
  *
  * Each constant is a floating-point weight scaled by 219/255 (Y) or
  * 224/255 (U, V), multiplied by 1 << RGB2YUV_SHIFT and rounded to int.
  * Naming: first letter = source component (R, G, B), second letter =
  * destination component (Y, U, V).
  */
 #define RGB2YUV_SHIFT 15

 /* Y row */
 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))

 /* U row */
 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))

 /* V row */
 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
 209
 210 extern const int32_t ff_yuv2rgb_coeffs[8][4];
 211
 /*
  * Floating-point RGB -> YUV coefficient sets: 8 rows of 9 values each.
  *
  * Within a row the values form three triples. The first triple sums to 1
  * and the second and third each sum to 0, with 0.5 at position 4 and at
  * position 8.
  * NOTE(review): the triples look like (G, B, R) weights for Y, U and V, and
  * the row index presumably matches that of ff_yuv2rgb_coeffs -- confirm
  * against the code that reads this table.
  *
  * Fix: the last row was missing the comma between 0.5 and -0.116. The two
  * literals were therefore evaluated as the single value 0.5 - 0.116, every
  * later coefficient moved one slot to the left, and the ninth element was
  * implicitly zero.
  */
 static const double rgb2yuv_table[8][9]={
     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5},
     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5},
     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5},
     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5},
     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5},
     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //SMPTE 170M
     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
 };
 222
 223 /*
 224 NOTES
 225 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 226
 227 TODO
 228 more intelligent misalignment avoidance for the horizontal scaler
 229 write special vertical cubic upscale version
 230 optimize C code (YV12 / minmax)
 231 add support for packed pixel YUV input & output
 232 add support for Y8 output
 233 optimize BGR24 & BGR32
 234 add BGR4 output support
 235 write special BGR->BGR scaler
 236 */
 237
 /*
  * 64-bit constants compiled only when both ARCH_X86 and CONFIG_GPL are
  * nonzero. Every declaration requests 8-byte alignment. None of them is
  * referenced by the C code visible in this part of the file.
  * NOTE(review): presumably they are consumed by the MMX inline assembly --
  * confirm.
  */
 238 #if ARCH_X86 && CONFIG_GPL
 /* byte patterns (0xF8 / 0xFC in every byte) and word patterns (0x0010 /
  * 0x0002 in every 16-bit word) */
 239 DECLARE_ASM_CONST(8, uint64_t, bF8)=       0xF8F8F8F8F8F8F8F8LL;
 240 DECLARE_ASM_CONST(8, uint64_t, bFC)=       0xFCFCFCFCFCFCFCFCLL;
 241 DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL;
 242 DECLARE_ASM_CONST(8, uint64_t, w02)=       0x0002000200020002LL;
 /* byte-select masks; the digits in each name mirror which of the eight
  * bytes are 0xFF (most significant byte first) */
 243 DECLARE_ASM_CONST(8, uint64_t, bm00001111)=0x00000000FFFFFFFFLL;
 244 DECLARE_ASM_CONST(8, uint64_t, bm00000111)=0x0000000000FFFFFFLL;
 245 DECLARE_ASM_CONST(8, uint64_t, bm11111000)=0xFFFFFFFFFF000000LL;
 246 DECLARE_ASM_CONST(8, uint64_t, bm01010101)=0x00FF00FF00FF00FFLL;
 247
 /* the bytes of these two tables, read in memory order on a little-endian
  * target, equal the rows of dither_2x2_4 and dither_2x2_8 */
 248 const DECLARE_ALIGNED(8, uint64_t, ff_dither4[2]) = {
 249         0x0103010301030103LL,
 250         0x0200020002000200LL,};
 251
 252 const DECLARE_ALIGNED(8, uint64_t, ff_dither8[2]) = {
 253         0x0602060206020602LL,
 254         0x0004000400040004LL,};
 255
 /* per-16-bit-word field masks: 5/6/5-bit fields (b16/g16/r16) and
  * 5/5/5-bit fields (b15/g15/r15) */
 256 DECLARE_ASM_CONST(8, uint64_t, b16Mask)=   0x001F001F001F001FLL;
 257 DECLARE_ASM_CONST(8, uint64_t, g16Mask)=   0x07E007E007E007E0LL;
 258 DECLARE_ASM_CONST(8, uint64_t, r16Mask)=   0xF800F800F800F800LL;
 259 DECLARE_ASM_CONST(8, uint64_t, b15Mask)=   0x001F001F001F001FLL;
 260 DECLARE_ASM_CONST(8, uint64_t, g15Mask)=   0x03E003E003E003E0LL;
 261 DECLARE_ASM_CONST(8, uint64_t, r15Mask)=   0x7C007C007C007C00LL;
 262
 /* masks that each select every third byte, at three different phases */
 263 DECLARE_ALIGNED(8, const uint64_t, ff_M24A)         = 0x00FF0000FF0000FFLL;
 264 DECLARE_ALIGNED(8, const uint64_t, ff_M24B)         = 0xFF0000FF0000FF00LL;
 265 DECLARE_ALIGNED(8, const uint64_t, ff_M24C)         = 0x0000FF0000FF0000LL;
 266
 /* two alternative coefficient sets, chosen by whether FAST_BGR2YV12 is
  * defined */
 267 #ifdef FAST_BGR2YV12
 268 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff)   = 0x000000210041000DULL;
 269 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff)   = 0x0000FFEEFFDC0038ULL;
 270 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff)   = 0x00000038FFD2FFF8ULL;
 271 #else
 272 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YCoeff)   = 0x000020E540830C8BULL;
 273 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UCoeff)   = 0x0000ED0FDAC23831ULL;
 274 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff)   = 0x00003831D0E6F6EAULL;
 275 #endif /* FAST_BGR2YV12 */
 /* 0x10 and 0x80 replicated in every byte, 0x0001 in every 16-bit word */
 276 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
 277 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 278 DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
 279
 /* the 16-bit words 0x0C88, 0x4087 and 0x20DE used below equal the integer
  * values of the BY, GY and RY macros (3208, 16519, 8414) */
 280 DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL;
 281 DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL;
 282 DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL;
 283 DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL;
 284 DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL;
 285
 /* two rows of four packed 16-bit coefficient quads; the second row holds
  * the same words with the 0x3838/0xECFF and 0xF6E4/0x3838 pairs swapped */
 286 DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUV[2][4]) = {
 287     {0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL},
 288     {0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL},
 289 };
 290
 291 DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL;
 292
 293 #endif /* ARCH_X86 && CONFIG_GPL */
 294
 295 // clipping helper table for C implementations:
 /* 768 bytes with no initializer, so the table starts out zero-filled; no
  * code in the visible part of this file reads or writes it. */
 296 static unsigned char clip_table[768];
 297
 /* Forward declaration only; the definition is not in the visible part of
  * this file. */
 298 static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b);
 299
 /* Two-row dither pattern with values 0..3: each row is a 2-value pattern
  * repeated four times to fill 8 columns. */
 300 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4[2][8])={
 301 {  1,   3,   1,   3,   1,   3,   1,   3, },
 302 {  2,   0,   2,   0,   2,   0,   2,   0, },
 303 };
 304
 /* Two-row dither pattern with values 0, 2, 4, 6: each row is a 2-value
  * pattern repeated four times to fill 8 columns. */
 305 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8[2][8])={
 306 {  6,   2,   6,   2,   6,   2,   6,   2, },
 307 {  0,   4,   0,   4,   0,   4,   0,   4, },
 308 };
 309
 /* 8x8 dither matrix with entries in the range 0..31. It has external
  * linkage (no `static`). */
 310 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32[8][8])={
 311 { 17,   9,  23,  15,  16,   8,  22,  14, },
 312 {  5,  29,   3,  27,   4,  28,   2,  26, },
 313 { 21,  13,  19,  11,  20,  12,  18,  10, },
 314 {  0,  24,   6,  30,   1,  25,   7,  31, },
 315 { 16,   8,  22,  14,  17,   9,  23,  15, },
 316 {  4,  28,   2,  26,   5,  29,   3,  27, },
 317 { 20,  12,  18,  10,  21,  13,  19,  11, },
 318 {  1,  25,   7,  31,   0,  24,   6,  30, },
 319 };
 320
 /* 8x8 dither matrix with entries in the range 0..72. It has external
  * linkage (no `static`). */
 321 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73[8][8])={
 322 {  0,  55,  14,  68,   3,  58,  17,  72, },
 323 { 37,  18,  50,  32,  40,  22,  54,  35, },
 324 {  9,  64,   5,  59,  13,  67,   8,  63, },
 325 { 46,  27,  41,  23,  49,  31,  44,  26, },
 326 {  2,  57,  16,  71,   1,  56,  15,  70, },
 327 { 39,  21,  52,  34,  38,  19,  51,  33, },
 328 { 11,  66,   7,  62,  10,  65,   6,  60, },
 329 { 48,  30,  43,  25,  47,  29,  42,  24, },
 330 };
 331
 /*
  * 8x8 dither matrix used by YSCALE_YUV_2_MONO2_C and YSCALE_YUV_2_MONOX_C,
  * which pick the row dither_8x8_220[y&7].
  *
  * Four variants are present, but only the first is ever compiled: its
  * condition is the literal `#if 1`, so the three gamma-correcting
  * alternatives below it are dead text. The active variant has entries in
  * the range 0..217.
  */
 332 #if 1
 333 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220[8][8])={
 334 {117,  62, 158, 103, 113,  58, 155, 100, },
 335 { 34, 199,  21, 186,  31, 196,  17, 182, },
 336 {144,  89, 131,  76, 141,  86, 127,  72, },
 337 {  0, 165,  41, 206,  10, 175,  52, 217, },
 338 {110,  55, 151,  96, 120,  65, 162, 107, },
 339 { 28, 193,  14, 179,  38, 203,  24, 189, },
 340 {138,  83, 124,  69, 148,  93, 134,  79, },
 341 {  7, 172,  48, 213,   3, 168,  45, 210, },
 342 };
 343 #elif 1
 344 // tries to correct a gamma of 1.5
 345 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220[8][8])={
 346 {  0, 143,  18, 200,   2, 156,  25, 215, },
 347 { 78,  28, 125,  64,  89,  36, 138,  74, },
 348 { 10, 180,   3, 161,  16, 195,   8, 175, },
 349 {109,  51,  93,  38, 121,  60, 105,  47, },
 350 {  1, 152,  23, 210,   0, 147,  20, 205, },
 351 { 85,  33, 134,  71,  81,  30, 130,  67, },
 352 { 14, 190,   6, 171,  12, 185,   5, 166, },
 353 {117,  57, 101,  44, 113,  54,  97,  41, },
 354 };
 355 #elif 1
 356 // tries to correct a gamma of 2.0
 357 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220[8][8])={
 358 {  0, 124,   8, 193,   0, 140,  12, 213, },
 359 { 55,  14, 104,  42,  66,  19, 119,  52, },
 360 {  3, 168,   1, 145,   6, 187,   3, 162, },
 361 { 86,  31,  70,  21,  99,  39,  82,  28, },
 362 {  0, 134,  11, 206,   0, 129,   9, 200, },
 363 { 62,  17, 114,  48,  58,  16, 109,  45, },
 364 {  5, 181,   2, 157,   4, 175,   1, 151, },
 365 { 95,  36,  78,  26,  90,  34,  74,  24, },
 366 };
 367 #else
 368 // tries to correct a gamma of 2.5
 369 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220[8][8])={
 370 {  0, 107,   3, 187,   0, 125,   6, 212, },
 371 { 39,   7,  86,  28,  49,  11, 102,  36, },
 372 {  1, 158,   0, 131,   3, 180,   1, 151, },
 373 { 68,  19,  52,  12,  81,  25,  64,  17, },
 374 {  0, 119,   5, 203,   0, 113,   4, 195, },
 375 { 45,   9,  96,  33,  42,   8,  91,  30, },
 376 {  2, 172,   1, 144,   2, 165,   0, 137, },
 377 { 77,  23,  60,  15,  72,  21,  56,  14, },
 378 };
 379 #endif
 380
 381 const char *sws_format_name(enum PixelFormat format)
 382 {
 383     if ((unsigned)format < PIX_FMT_NB && av_pix_fmt_descriptors[format].name)
 384         return av_pix_fmt_descriptors[format].name;
 385     else
 386         return "Unknown format";
 387 }
 388
 389 static av_always_inline void yuv2yuvX16inC_template(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 390                                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
 391                                                     const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest,
 392                                                     int dstW, int chrDstW, int big_endian)
 393 {
 394     //FIXME Optimize (just quickly written not optimized..)
 395     int i;
 396
 397     for (i = 0; i < dstW; i++) {
 398         int val = 1 << 10;
 399         int j;
 400
 401         for (j = 0; j < lumFilterSize; j++)
 402             val += lumSrc[j][i] * lumFilter[j];
 403
 404         if (big_endian) {
 405             AV_WB16(&dest[i], av_clip_uint16(val >> 11));
 406         } else {
 407             AV_WL16(&dest[i], av_clip_uint16(val >> 11));
 408         }
 409     }
 410
 411     if (uDest) {
 412         for (i = 0; i < chrDstW; i++) {
 413             int u = 1 << 10;
 414             int v = 1 << 10;
 415             int j;
 416
 417             for (j = 0; j < chrFilterSize; j++) {
 418                 u += chrSrc[j][i       ] * chrFilter[j];
 419                 v += chrSrc[j][i + VOFW] * chrFilter[j];
 420             }
 421
 422             if (big_endian) {
 423                 AV_WB16(&uDest[i], av_clip_uint16(u >> 11));
 424                 AV_WB16(&vDest[i], av_clip_uint16(v >> 11));
 425             } else {
 426                 AV_WL16(&uDest[i], av_clip_uint16(u >> 11));
 427                 AV_WL16(&vDest[i], av_clip_uint16(v >> 11));
 428             }
 429         }
 430     }
 431
 432     if (CONFIG_SWSCALE_ALPHA && aDest) {
 433         for (i = 0; i < dstW; i++) {
 434             int val = 1 << 10;
 435             int j;
 436
 437             for (j = 0; j < lumFilterSize; j++)
 438                 val += alpSrc[j][i] * lumFilter[j];
 439
 440             if (big_endian) {
 441                 AV_WB16(&aDest[i], av_clip_uint16(val >> 11));
 442             } else {
 443                 AV_WL16(&aDest[i], av_clip_uint16(val >> 11));
 444             }
 445         }
 446     }
 447 }
 448
 449 static inline void yuv2yuvX16inC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 450                                  const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
 451                                  const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, int dstW, int chrDstW,
 452                                  enum PixelFormat dstFormat)
 453 {
 454     if (isBE(dstFormat)) {
 455         yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
 456                                chrFilter, chrSrc, chrFilterSize,
 457                                alpSrc,
 458                                dest, uDest, vDest, aDest,
 459                                dstW, chrDstW, 1);
 460     } else {
 461         yuv2yuvX16inC_template(lumFilter, lumSrc, lumFilterSize,
 462                                chrFilter, chrSrc, chrFilterSize,
 463                                alpSrc,
 464                                dest, uDest, vDest, aDest,
 465                                dstW, chrDstW, 0);
 466     }
 467 }
 468
 469 static inline void yuv2yuvXinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 470                                const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
 471                                const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, int dstW, int chrDstW)
 472 {
 473     //FIXME Optimize (just quickly written not optimized..)
 474     int i;
 475     for (i=0; i<dstW; i++) {
 476         int val=1<<18;
 477         int j;
 478         for (j=0; j<lumFilterSize; j++)
 479             val += lumSrc[j][i] * lumFilter[j];
 480
 481         dest[i]= av_clip_uint8(val>>19);
 482     }
 483
 484     if (uDest)
 485         for (i=0; i<chrDstW; i++) {
 486             int u=1<<18;
 487             int v=1<<18;
 488             int j;
 489             for (j=0; j<chrFilterSize; j++) {
 490                 u += chrSrc[j][i] * chrFilter[j];
 491                 v += chrSrc[j][i + VOFW] * chrFilter[j];
 492             }
 493
 494             uDest[i]= av_clip_uint8(u>>19);
 495             vDest[i]= av_clip_uint8(v>>19);
 496         }
 497
 498     if (CONFIG_SWSCALE_ALPHA && aDest)
 499         for (i=0; i<dstW; i++) {
 500             int val=1<<18;
 501             int j;
 502             for (j=0; j<lumFilterSize; j++)
 503                 val += alpSrc[j][i] * lumFilter[j];
 504
 505             aDest[i]= av_clip_uint8(val>>19);
 506         }
 507
 508 }
 509
 510 static inline void yuv2nv12XinC(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 511                                 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
 512                                 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
 513 {
 514     //FIXME Optimize (just quickly written not optimized..)
 515     int i;
 516     for (i=0; i<dstW; i++) {
 517         int val=1<<18;
 518         int j;
 519         for (j=0; j<lumFilterSize; j++)
 520             val += lumSrc[j][i] * lumFilter[j];
 521
 522         dest[i]= av_clip_uint8(val>>19);
 523     }
 524
 525     if (!uDest)
 526         return;
 527
 528     if (dstFormat == PIX_FMT_NV12)
 529         for (i=0; i<chrDstW; i++) {
 530             int u=1<<18;
 531             int v=1<<18;
 532             int j;
 533             for (j=0; j<chrFilterSize; j++) {
 534                 u += chrSrc[j][i] * chrFilter[j];
 535                 v += chrSrc[j][i + VOFW] * chrFilter[j];
 536             }
 537
 538             uDest[2*i]= av_clip_uint8(u>>19);
 539             uDest[2*i+1]= av_clip_uint8(v>>19);
 540         }
 541     else
 542         for (i=0; i<chrDstW; i++) {
 543             int u=1<<18;
 544             int v=1<<18;
 545             int j;
 546             for (j=0; j<chrFilterSize; j++) {
 547                 u += chrSrc[j][i] * chrFilter[j];
 548                 v += chrSrc[j][i + VOFW] * chrFilter[j];
 549             }
 550
 551             uDest[2*i]= av_clip_uint8(v>>19);
 552             uDest[2*i+1]= av_clip_uint8(u>>19);
 553         }
 554 }
 555
 /*
  * Loop header for the filtered packed-output writers; it does no clipping.
  *
  * Opens `for (i=0; i<(dstW>>1); i++) {` and leaves it OPEN: the code that
  * follows the expansion supplies the rest of the loop body and the
  * closing brace.
  *
  * Per iteration it declares and fills:
  *   Y1, Y2 - luma at columns 2*i and 2*i+1, summed over lumFilterSize taps
  *   U, V   - chroma at column i and i+VOFW, summed over chrFilterSize taps
  *   A1, A2 - alpha at columns 2*i and 2*i+1, only when `alpha` is nonzero
  *   r, g, b - pointers to `type`, declared but not assigned here
  * All sums start at 1<<18 and are shifted right by 19.
  *
  * Names that must be in scope at the expansion site: i, dstW, lumSrc,
  * lumFilter, lumFilterSize, chrSrc, chrFilter, chrFilterSize, alpSrc.
  */
 556 #define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha) \
 557     for (i=0; i<(dstW>>1); i++) {\
 558         int j;\
 559         int Y1 = 1<<18;\
 560         int Y2 = 1<<18;\
 561         int U  = 1<<18;\
 562         int V  = 1<<18;\
 563         int av_unused A1, A2;\
 564         type av_unused *r, *b, *g;\
 565         const int i2= 2*i;\
 566         \
 567         for (j=0; j<lumFilterSize; j++) {\
 568             Y1 += lumSrc[j][i2] * lumFilter[j];\
 569             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 570         }\
 571         for (j=0; j<chrFilterSize; j++) {\
 572             U += chrSrc[j][i] * chrFilter[j];\
 573             V += chrSrc[j][i+VOFW] * chrFilter[j];\
 574         }\
 575         Y1>>=19;\
 576         Y2>>=19;\
 577         U >>=19;\
 578         V >>=19;\
 579         if (alpha) {\
 580             A1 = 1<<18;\
 581             A2 = 1<<18;\
 582             for (j=0; j<lumFilterSize; j++) {\
 583                 A1 += alpSrc[j][i2  ] * lumFilter[j];\
 584                 A2 += alpSrc[j][i2+1] * lumFilter[j];\
 585             }\
 586             A1>>=19;\
 587             A2>>=19;\
 588         }\
 589
 /*
  * YSCALE_YUV_2_PACKEDX_NOCLIP_C followed by clamping of Y1, Y2, U, V (and
  * A1, A2 when `alpha` is nonzero) to 0..255. The loop opened by the inner
  * macro is still open after this expansion.
  *
  * The clamping code runs only when bit 8 of the OR of the values is set.
  * NOTE(review): a value outside 0..255 whose bit 8 is clear (512, for
  * example) would not trigger it; presumably the filter output never gets
  * that far out of range -- confirm.
  */
 590 #define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
 591         YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha)\
 592         if ((Y1|Y2|U|V)&256) {\
 593             if (Y1>255)   Y1=255; \
 594             else if (Y1<0)Y1=0;   \
 595             if (Y2>255)   Y2=255; \
 596             else if (Y2<0)Y2=0;   \
 597             if (U>255)    U=255;  \
 598             else if (U<0) U=0;    \
 599             if (V>255)    V=255;  \
 600             else if (V<0) V=0;    \
 601         }\
 602         if (alpha && ((A1|A2)&256)) {\
 603             A1=av_clip_uint8(A1);\
 604             A2=av_clip_uint8(A2);\
 605         }
 606
 /*
  * Loop header for the full-chroma-resolution packed-output writers.
  *
  * Opens `for (i=0; i<dstW; i++) {` and leaves it OPEN: the code that
  * follows the expansion supplies the rest of the loop body and the
  * closing brace.
  *
  * Per iteration it declares:
  *   Y    - luma at column i, summed over lumFilterSize taps, >> 10
  *   U, V - chroma at column i and i+VOFW, summed over chrFilterSize taps
  *          starting from -(128<<19), >> 10
  *   A    - only when `alpha` is nonzero: alpha at column i, starting from
  *          `rnd`, >> 19, passed through av_clip_uint8() when bit 8 is set
  *   R, G, B - declared but not assigned here
  *
  * Names that must be in scope at the expansion site: i, dstW, lumSrc,
  * lumFilter, lumFilterSize, chrSrc, chrFilter, chrFilterSize, alpSrc.
  *
  * The chroma bias is written as -(128<<19) rather than -128<<19: the
  * latter left-shifts a negative value, which is undefined behaviour in C.
  * The resulting value is the same.
  */
 #define YSCALE_YUV_2_PACKEDX_FULL_C(rnd,alpha) \
     for (i=0; i<dstW; i++) {\
         int j;\
         int Y = 0;\
         int U = -(128<<19);\
         int V = -(128<<19);\
         int av_unused A;\
         int R,G,B;\
         \
         for (j=0; j<lumFilterSize; j++) {\
             Y += lumSrc[j][i     ] * lumFilter[j];\
         }\
         for (j=0; j<chrFilterSize; j++) {\
             U += chrSrc[j][i     ] * chrFilter[j];\
             V += chrSrc[j][i+VOFW] * chrFilter[j];\
         }\
         Y >>=10;\
         U >>=10;\
         V >>=10;\
         if (alpha) {\
             A = rnd;\
             for (j=0; j<lumFilterSize; j++)\
                 A += alpSrc[j][i     ] * lumFilter[j];\
             A >>=19;\
             if (A&256)\
                 A = av_clip_uint8(A);\
         }
 634
 /*
  * YSCALE_YUV_2_PACKEDX_FULL_C followed by the YUV -> RGB matrix. The loop
  * opened by the inner macro is still open after this expansion.
  *
  * Y has c->yuv2rgb_y_offset subtracted, is multiplied by
  * c->yuv2rgb_y_coeff and gets `rnd` added. R, G and B are then formed from
  * Y, U and V with the c->yuv2rgb_* coefficients. The inner macro receives
  * rnd>>3 as its alpha rounding term.
  *
  * When either of the two top bits of R|G|B is set, each component is
  * clamped to the range 0 .. (256<<22)-1.
  *
  * Requires `c` in scope in addition to the names the inner macro needs.
  */
 635 #define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
 636     YSCALE_YUV_2_PACKEDX_FULL_C(rnd>>3,alpha)\
 637         Y-= c->yuv2rgb_y_offset;\
 638         Y*= c->yuv2rgb_y_coeff;\
 639         Y+= rnd;\
 640         R= Y + V*c->yuv2rgb_v2r_coeff;\
 641         G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
 642         B= Y +                          U*c->yuv2rgb_u2b_coeff;\
 643         if ((R|G|B)&(0xC0000000)) {\
 644             if (R>=(256<<22))   R=(256<<22)-1; \
 645             else if (R<0)R=0;   \
 646             if (G>=(256<<22))   G=(256<<22)-1; \
 647             else if (G<0)G=0;   \
 648             if (B>=(256<<22))   B=(256<<22)-1; \
 649             else if (B<0)B=0;   \
 650         }\
 651
 652
 /*
  * Loop header for filtered 16-bit gray output.
  *
  * Opens `for (i=0; i<(dstW>>1); i++) {` and leaves it OPEN for the code
  * that follows the expansion.
  *
  * Per iteration it computes Y1 and Y2 (luma at columns 2*i and 2*i+1,
  * summed over lumFilterSize taps starting from 1<<18, >> 11) and clamps
  * them to 0..65535 when bit 16 of the OR is set.
  *
  * U and V are declared and set to 1<<18 but never changed. That value has
  * bit 16 clear, so they have no influence on the clamp test.
  * NOTE(review): the initial bias 1<<18 does not match a shift of 11 the
  * way it matches the shift of 19 in the 8-bit macros -- confirm whether
  * that is intended.
  */
 653 #define YSCALE_YUV_2_GRAY16_C \
 654     for (i=0; i<(dstW>>1); i++) {\
 655         int j;\
 656         int Y1 = 1<<18;\
 657         int Y2 = 1<<18;\
 658         int U  = 1<<18;\
 659         int V  = 1<<18;\
 660         \
 661         const int i2= 2*i;\
 662         \
 663         for (j=0; j<lumFilterSize; j++) {\
 664             Y1 += lumSrc[j][i2] * lumFilter[j];\
 665             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 666         }\
 667         Y1>>=11;\
 668         Y2>>=11;\
 669         if ((Y1|Y2|U|V)&65536) {\
 670             if (Y1>65535)   Y1=65535; \
 671             else if (Y1<0)Y1=0;   \
 672             if (Y2>65535)   Y2=65535; \
 673             else if (Y2<0)Y2=0;   \
 674         }
 675
 /*
  * YSCALE_YUV_2_PACKEDX_C followed by the table lookups that set r, g and
  * b: r from c->table_rV[V], g from c->table_gU[U] + c->table_gV[V], b from
  * c->table_bU[U], each cast to `type *`. The loop opened by the inner
  * macro is still open after this expansion.
  */
 676 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
 677     YSCALE_YUV_2_PACKEDX_C(type,alpha)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
 678     r = (type *)c->table_rV[V];   \
 679     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
 680     b = (type *)c->table_bU[U];   \
 681
 /*
  * Loop header for packed output blended from two input lines.
  *
  * Opens `for (i=0; i<(dstW>>1); i++) {` and leaves it OPEN for the code
  * that follows the expansion.
  *
  * Per iteration: Y1/Y2 = (buf0*yalpha1 + buf1*yalpha) >> 19 at columns 2*i
  * and 2*i+1; U/V = (uvbuf0*uvalpha1 + uvbuf1*uvalpha) >> 19 at column i
  * and i+VOFW; A1/A2 likewise from abuf0/abuf1 when `alpha` is nonzero.
  * r, g, b are declared as pointers to `type` but not assigned. No clipping
  * is done.
  *
  * Names that must be in scope: i, dstW, buf0, buf1, uvbuf0, uvbuf1, abuf0,
  * abuf1, yalpha, yalpha1, uvalpha, uvalpha1.
  */
 682 #define YSCALE_YUV_2_PACKED2_C(type,alpha)   \
 683     for (i=0; i<(dstW>>1); i++) { \
 684         const int i2= 2*i;       \
 685         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
 686         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
 687         int U= (uvbuf0[i     ]*uvalpha1+uvbuf1[i     ]*uvalpha)>>19;  \
 688         int V= (uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19;  \
 689         type av_unused *r, *b, *g;                                    \
 690         int av_unused A1, A2;                                         \
 691         if (alpha) {\
 692             A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
 693             A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
 694         }\
 695
 /*
  * Loop header for 16-bit gray output blended from two input lines: same
  * luma blend as YSCALE_YUV_2_PACKED2_C but shifted right by 11 instead of
  * 19, with no chroma and no alpha. The loop is left OPEN for the code
  * that follows the expansion.
  */
 696 #define YSCALE_YUV_2_GRAY16_2_C   \
 697     for (i=0; i<(dstW>>1); i++) { \
 698         const int i2= 2*i;       \
 699         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
 700         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;           \
 701
 /*
  * YSCALE_YUV_2_PACKED2_C followed by the table lookups that set r, g and
  * b from c->table_rV[V], c->table_gU[U] + c->table_gV[V] and
  * c->table_bU[U]. The loop opened by the inner macro is still open.
  */
 702 #define YSCALE_YUV_2_RGB2_C(type,alpha) \
 703     YSCALE_YUV_2_PACKED2_C(type,alpha)\
 704     r = (type *)c->table_rV[V];\
 705     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 706     b = (type *)c->table_bU[U];\
 707
 /*
  * Loop header for packed output taken from a single input line.
  *
  * Opens `for (i=0; i<(dstW>>1); i++) {` and leaves it OPEN for the code
  * that follows the expansion.
  *
  * Per iteration: Y1/Y2 = buf0 >> 7 at columns 2*i and 2*i+1; U/V =
  * uvbuf1 >> 7 at column i and i+VOFW (uvbuf0 is not read here); A1/A2 =
  * abuf0 >> 7 when `alpha` is nonzero. r, g, b are declared as pointers to
  * `type` but not assigned. No clipping is done.
  */
 708 #define YSCALE_YUV_2_PACKED1_C(type,alpha) \
 709     for (i=0; i<(dstW>>1); i++) {\
 710         const int i2= 2*i;\
 711         int Y1= buf0[i2  ]>>7;\
 712         int Y2= buf0[i2+1]>>7;\
 713         int U= (uvbuf1[i     ])>>7;\
 714         int V= (uvbuf1[i+VOFW])>>7;\
 715         type av_unused *r, *b, *g;\
 716         int av_unused A1, A2;\
 717         if (alpha) {\
 718             A1= abuf0[i2  ]>>7;\
 719             A2= abuf0[i2+1]>>7;\
 720         }\
 721
 /*
  * Loop header for 16-bit gray output taken from a single input line:
  * Y1/Y2 = buf0 << 1 at columns 2*i and 2*i+1. The loop is left OPEN for
  * the code that follows the expansion.
  */
 722 #define YSCALE_YUV_2_GRAY16_1_C \
 723     for (i=0; i<(dstW>>1); i++) {\
 724         const int i2= 2*i;\
 725         int Y1= buf0[i2  ]<<1;\
 726         int Y2= buf0[i2+1]<<1;\
 727
 /*
  * YSCALE_YUV_2_PACKED1_C followed by the table lookups that set r, g and
  * b from c->table_rV[V], c->table_gU[U] + c->table_gV[V] and
  * c->table_bU[U]. The loop opened by the inner macro is still open.
  */
 728 #define YSCALE_YUV_2_RGB1_C(type,alpha) \
 729     YSCALE_YUV_2_PACKED1_C(type,alpha)\
 730     r = (type *)c->table_rV[V];\
 731     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 732     b = (type *)c->table_bU[U];\
 733
 /*
  * Variant of YSCALE_YUV_2_PACKED1_C that differs only in the chroma:
  * U/V = (uvbuf0 + uvbuf1) >> 8, i.e. both chroma lines are summed and the
  * shift is one bit larger. Luma and alpha still come from buf0/abuf0 >> 7.
  * The loop is left OPEN for the code that follows the expansion.
  */
 734 #define YSCALE_YUV_2_PACKED1B_C(type,alpha) \
 735     for (i=0; i<(dstW>>1); i++) {\
 736         const int i2= 2*i;\
 737         int Y1= buf0[i2  ]>>7;\
 738         int Y2= buf0[i2+1]>>7;\
 739         int U= (uvbuf0[i     ] + uvbuf1[i     ])>>8;\
 740         int V= (uvbuf0[i+VOFW] + uvbuf1[i+VOFW])>>8;\
 741         type av_unused *r, *b, *g;\
 742         int av_unused A1, A2;\
 743         if (alpha) {\
 744             A1= abuf0[i2  ]>>7;\
 745             A2= abuf0[i2+1]>>7;\
 746         }\
 747
 /*
  * YSCALE_YUV_2_PACKED1B_C followed by the table lookups that set r, g and
  * b from c->table_rV[V], c->table_gU[U] + c->table_gV[V] and
  * c->table_bU[U]. The loop opened by the inner macro is still open.
  */
 748 #define YSCALE_YUV_2_RGB1B_C(type,alpha) \
 749     YSCALE_YUV_2_PACKED1B_C(type,alpha)\
 750     r = (type *)c->table_rV[V];\
 751     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 752     b = (type *)c->table_bU[U];\
 753
 /*
  * 1-bit-per-pixel output blended from two input lines. Unlike the PACKED
  * macros this one is complete: its loop is closed inside the macro.
  *
  * Each iteration handles 8 pixels and writes one byte through `dest`,
  * which is then advanced. For each pixel the blended luma
  * (buf0*yalpha1 + buf1*yalpha) >> 19 plus the dither value d128[k] indexes
  * the table g; `acc += acc + value` shifts acc left by one and adds the
  * looked-up value, so the first pixel ends up in the highest position. The
  * byte is stored as acc when c->dstFormat is PIX_FMT_MONOBLACK and as ~acc
  * otherwise.
  *
  * The dither row is dither_8x8_220[y&7]; g is
  * c->table_gU[128] + c->table_gV[128].
  * The loop condition is i < dstW-7, so when dstW is not a multiple of 8
  * the trailing pixels are not written.
  *
  * Names that must be in scope: i, y, c, dest, dstW, buf0, buf1, yalpha,
  * yalpha1.
  */
 754 #define YSCALE_YUV_2_MONO2_C \
 755     const uint8_t * const d128=dither_8x8_220[y&7];\
 756     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 757     for (i=0; i<dstW-7; i+=8) {\
 758         int acc;\
 759         acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
 760         acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
 761         acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
 762         acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
 763         acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
 764         acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
 765         acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
 766         acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
 767         ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 768         dest++;\
 769     }\
 770
 771
 /*
  * 1-bit-per-pixel output from vertically filtered luma. The loop is
  * closed inside the macro.
  *
  * Each iteration handles two pixels: Y1 and Y2 are summed over
  * lumFilterSize taps starting from 1<<18, shifted right by 19 and clamped
  * to 0..255 when bit 8 of Y1|Y2 is set. Each value plus its dither value
  * d128[column & 7] indexes the table g, and the result is shifted into
  * acc with `acc += acc + value`.
  *
  * A byte is stored through `dest` (and dest advanced) whenever (i&7)==6,
  * i.e. after every 8th pixel: acc when c->dstFormat is PIX_FMT_MONOBLACK,
  * ~acc otherwise. acc is not reset between bytes; only its low 8 bits
  * reach the uint8_t store.
  *
  * The loop condition is i < dstW-1 with a step of 2, and a byte is only
  * written after a full group of 8 pixels, so trailing pixels of an
  * incomplete group are not written.
  * NOTE(review): the clamp test looks at bit 8 only, as in
  * YSCALE_YUV_2_PACKEDX_C -- confirm the filter output cannot exceed that.
  *
  * Names that must be in scope: i, y, c, dest, dstW, lumSrc, lumFilter,
  * lumFilterSize.
  */
 772 #define YSCALE_YUV_2_MONOX_C \
 773     const uint8_t * const d128=dither_8x8_220[y&7];\
 774     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 775     int acc=0;\
 776     for (i=0; i<dstW-1; i+=2) {\
 777         int j;\
 778         int Y1=1<<18;\
 779         int Y2=1<<18;\
 780 \
 781         for (j=0; j<lumFilterSize; j++) {\
 782             Y1 += lumSrc[j][i] * lumFilter[j];\
 783             Y2 += lumSrc[j][i+1] * lumFilter[j];\
 784         }\
 785         Y1>>=19;\
 786         Y2>>=19;\
 787         if ((Y1|Y2)&256) {\
 788             if (Y1>255)   Y1=255;\
 789             else if (Y1<0)Y1=0;\
 790             if (Y2>255)   Y2=255;\
 791             else if (Y2<0)Y2=0;\
 792         }\
 793         acc+= acc + g[Y1+d128[(i+0)&7]];\
 794         acc+= acc + g[Y2+d128[(i+1)&7]];\
 795         if ((i&7)==6) {\
 796             ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 797             dest++;\
 798         }\
 799     }
 800
 801
/*
 * Packed-output dispatcher: switches on c->dstFormat and stores the pixels of
 * one output line into dest.
 *
 * Macro arguments (all are code fragments, not values):
 *   func(type, alpha) - expanded in front of the stores of every RGB-like
 *                       case. The stores below use r, g, b (lookup tables),
 *                       Y1, Y2, A1, A2, i and i2 without declaring them and
 *                       are followed by a lone '}', so func presumably
 *                       declares those names and opens the per-pixel-pair
 *                       loop that the '}' closes.
 *   func2             - same role for PIX_FMT_YUYV422 / PIX_FMT_UYVY422; the
 *                       stores additionally use U and V.
 *   func_g16          - same role for PIX_FMT_GRAY16BE / PIX_FMT_GRAY16LE;
 *                       Y1/Y2 are stored as two bytes each.
 *   func_monoblack    - complete code for PIX_FMT_MONOBLACK / MONOWHITE.
 *
 * The expansion site must provide c, dest and y (used for dither selection).
 * There is no default case: a dstFormat not listed here stores nothing.
 */
#define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
    switch(c->dstFormat) {\
    case PIX_FMT_RGB48BE:\
    case PIX_FMT_RGB48LE:\
        /* each 16-bit component is written as the same 8-bit table value twice */\
        func(uint8_t,0)\
            ((uint8_t*)dest)[ 0]= r[Y1];\
            ((uint8_t*)dest)[ 1]= r[Y1];\
            ((uint8_t*)dest)[ 2]= g[Y1];\
            ((uint8_t*)dest)[ 3]= g[Y1];\
            ((uint8_t*)dest)[ 4]= b[Y1];\
            ((uint8_t*)dest)[ 5]= b[Y1];\
            ((uint8_t*)dest)[ 6]= r[Y2];\
            ((uint8_t*)dest)[ 7]= r[Y2];\
            ((uint8_t*)dest)[ 8]= g[Y2];\
            ((uint8_t*)dest)[ 9]= g[Y2];\
            ((uint8_t*)dest)[10]= b[Y2];\
            ((uint8_t*)dest)[11]= b[Y2];\
            dest+=12;\
        }\
        break;\
    case PIX_FMT_RGBA:\
    case PIX_FMT_BGRA:\
        /* CONFIG_SMALL: one loop with a run-time alpha test instead of two loops */\
        if (CONFIG_SMALL) {\
            int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
            func(uint32_t,needAlpha)\
                ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
                ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
            }\
        } else {\
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
                func(uint32_t,1)\
                    ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
                    ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
                }\
            } else {\
                func(uint32_t,0)\
                    ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
                    ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
                }\
            }\
        }\
        break;\
    case PIX_FMT_ARGB:\
    case PIX_FMT_ABGR:\
        /* same as the RGBA/BGRA case, but alpha is added unshifted (low byte of the word) */\
        if (CONFIG_SMALL) {\
            int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
            func(uint32_t,needAlpha)\
                ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
                ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
            }\
        } else {\
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
                func(uint32_t,1)\
                    ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
                    ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
                }\
            } else {\
                func(uint32_t,0)\
                    ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
                    ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
                }\
            }\
        }                \
        break;\
    case PIX_FMT_RGB24:\
        func(uint8_t,0)\
            ((uint8_t*)dest)[0]= r[Y1];\
            ((uint8_t*)dest)[1]= g[Y1];\
            ((uint8_t*)dest)[2]= b[Y1];\
            ((uint8_t*)dest)[3]= r[Y2];\
            ((uint8_t*)dest)[4]= g[Y2];\
            ((uint8_t*)dest)[5]= b[Y2];\
            dest+=6;\
        }\
        break;\
    case PIX_FMT_BGR24:\
        func(uint8_t,0)\
            ((uint8_t*)dest)[0]= b[Y1];\
            ((uint8_t*)dest)[1]= g[Y1];\
            ((uint8_t*)dest)[2]= r[Y1];\
            ((uint8_t*)dest)[3]= b[Y2];\
            ((uint8_t*)dest)[4]= g[Y2];\
            ((uint8_t*)dest)[5]= r[Y2];\
            dest+=6;\
        }\
        break;\
    case PIX_FMT_RGB565:\
    case PIX_FMT_BGR565:\
        {\
            /* 2x2 ordered dither; blue uses the opposite row of the table from red */\
            const int dr1= dither_2x2_8[y&1    ][0];\
            const int dg1= dither_2x2_4[y&1    ][0];\
            const int db1= dither_2x2_8[(y&1)^1][0];\
            const int dr2= dither_2x2_8[y&1    ][1];\
            const int dg2= dither_2x2_4[y&1    ][1];\
            const int db2= dither_2x2_8[(y&1)^1][1];\
            func(uint16_t,0)\
                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
            }\
        }\
        break;\
    case PIX_FMT_RGB555:\
    case PIX_FMT_BGR555:\
        {\
            /* all three components use dither_2x2_8; green takes the column opposite to red */\
            const int dr1= dither_2x2_8[y&1    ][0];\
            const int dg1= dither_2x2_8[y&1    ][1];\
            const int db1= dither_2x2_8[(y&1)^1][0];\
            const int dr2= dither_2x2_8[y&1    ][1];\
            const int dg2= dither_2x2_8[y&1    ][0];\
            const int db2= dither_2x2_8[(y&1)^1][1];\
            func(uint16_t,0)\
                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
            }\
        }\
        break;\
    case PIX_FMT_RGB8:\
    case PIX_FMT_BGR8:\
        {\
            /* 8x8 ordered dither rows selected by y; column selected by the pixel index */\
            const uint8_t * const d64= dither_8x8_73[y&7];\
            const uint8_t * const d32= dither_8x8_32[y&7];\
            func(uint8_t,0)\
                ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
                ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
            }\
        }\
        break;\
    case PIX_FMT_RGB4:\
    case PIX_FMT_BGR4:\
        {\
            const uint8_t * const d64= dither_8x8_73 [y&7];\
            const uint8_t * const d128=dither_8x8_220[y&7];\
            func(uint8_t,0)\
                /* two 4-bit pixels per byte: first pixel in the low nibble, second shifted up by 4; indexed by i, not i2 */\
                ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
                                 + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
            }\
        }\
        break;\
    case PIX_FMT_RGB4_BYTE:\
    case PIX_FMT_BGR4_BYTE:\
        {\
            /* same dithering as RGB4/BGR4, but one pixel per output byte */\
            const uint8_t * const d64= dither_8x8_73 [y&7];\
            const uint8_t * const d128=dither_8x8_220[y&7];\
            func(uint8_t,0)\
                ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
                ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
            }\
        }\
        break;\
    case PIX_FMT_MONOBLACK:\
    case PIX_FMT_MONOWHITE:\
        {\
            func_monoblack\
        }\
        break;\
    case PIX_FMT_YUYV422:\
        /* byte order per pixel pair: Y1 U Y2 V */\
        func2\
            ((uint8_t*)dest)[2*i2+0]= Y1;\
            ((uint8_t*)dest)[2*i2+1]= U;\
            ((uint8_t*)dest)[2*i2+2]= Y2;\
            ((uint8_t*)dest)[2*i2+3]= V;\
        }                \
        break;\
    case PIX_FMT_UYVY422:\
        /* byte order per pixel pair: U Y1 V Y2 */\
        func2\
            ((uint8_t*)dest)[2*i2+0]= U;\
            ((uint8_t*)dest)[2*i2+1]= Y1;\
            ((uint8_t*)dest)[2*i2+2]= V;\
            ((uint8_t*)dest)[2*i2+3]= Y2;\
        }                \
        break;\
    case PIX_FMT_GRAY16BE:\
        /* high byte first */\
        func_g16\
            ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
            ((uint8_t*)dest)[2*i2+1]= Y1;\
            ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
            ((uint8_t*)dest)[2*i2+3]= Y2;\
        }                \
        break;\
    case PIX_FMT_GRAY16LE:\
        /* low byte first */\
        func_g16\
            ((uint8_t*)dest)[2*i2+0]= Y1;\
            ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
            ((uint8_t*)dest)[2*i2+2]= Y2;\
            ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
        }                \
        break;\
    }\

 990
 991
/**
 * Plain-C vertical scaler producing one line of packed output.
 *
 * The whole body is an expansion of YSCALE_YUV_2_ANYRGB_C, which switches on
 * c->dstFormat and writes to dest. The per-format loop code comes from
 * YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C, YSCALE_YUV_2_GRAY16_C and
 * YSCALE_YUV_2_MONOX_C, which pick up this function's parameters by name.
 *
 * @param c             scaler context (dstFormat and lookup tables are read)
 * @param lumFilter     vertical luma filter coefficients
 * @param lumSrc        luma source lines, one per filter tap
 * @param lumFilterSize number of luma filter taps
 * @param chrFilter     vertical chroma filter coefficients (presumably used
 *                      by the RGB/packed loop macros; not visible here)
 * @param chrSrc        chroma source lines (same remark)
 * @param chrFilterSize number of chroma filter taps (same remark)
 * @param alpSrc        alpha source lines (presumably only read when alpha
 *                      output is requested; confirm in the loop macros)
 * @param dest          output line
 * @param dstW          output width in pixels
 * @param y             output line number, used to select dither rows
 */
static inline void yuv2packedXinC(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                  const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
{
    int i; /* loop counter used inside the expanded macros */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
}
 999
/**
 * Plain-C vertical scaler writing one line of packed 24/32-bit RGB, one pixel
 * per loop iteration.
 *
 * Handles PIX_FMT_ARGB, RGB24, RGBA, ABGR, BGR24 and BGRA; any other
 * dstFormat hits assert(0).
 *
 * The per-pixel loop header comes from YSCALE_YUV_2_RGBX_FULL_C, which is
 * expected to open a block (closed by the '}' after each group of stores) and
 * to provide R, G, B and, when its second argument is non-zero, A.
 *
 * Byte layout trick: the colour bytes are always written at dest[0..2] and the
 * alpha byte at dest[aidx].
 *   - RGBA/BGRA: aidx = 3, alpha follows the colour bytes.
 *   - RGB24/BGR24: aidx = 2; the alpha store lands on dest[2] and is then
 *     overwritten by the third colour store, so no alpha byte is emitted.
 *   - ARGB/ABGR: dest is advanced by one and aidx = -1, so alpha goes to the
 *     byte in front of the colour bytes.
 * The order of the stores (alpha first) therefore matters.
 */
static inline void yuv2rgbXinC_full(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                    const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
{
    int i;
    int step= fmt_depth(c->dstFormat)/8; /* bytes to advance dest per pixel */
    int aidx= 3;                         /* offset of the alpha byte relative to dest */

    switch(c->dstFormat) {
    case PIX_FMT_ARGB:
        dest++;
        aidx= 0;
        /* fall through: the decrement below turns aidx into -1 */
    case PIX_FMT_RGB24:
        aidx--;
        /* fall through */
    case PIX_FMT_RGBA:
        /* CONFIG_SMALL: single loop with a run-time alpha test instead of two loops */
        if (CONFIG_SMALL) {
            int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
            YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
                dest[aidx]= needAlpha ? A : 255;
                dest[0]= R>>22;
                dest[1]= G>>22;
                dest[2]= B>>22;
                dest+= step;
            }
        } else {
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
                    dest[aidx]= A;
                    dest[0]= R>>22;
                    dest[1]= G>>22;
                    dest[2]= B>>22;
                    dest+= step;
                }
            } else {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
                    dest[aidx]= 255;
                    dest[0]= R>>22;
                    dest[1]= G>>22;
                    dest[2]= B>>22;
                    dest+= step;
                }
            }
        }
        break;
    case PIX_FMT_ABGR:
        dest++;
        aidx= 0;
        /* fall through: the decrement below turns aidx into -1 */
    case PIX_FMT_BGR24:
        aidx--;
        /* fall through */
    case PIX_FMT_BGRA:
        /* identical to the RGB group above with the R and B stores swapped */
        if (CONFIG_SMALL) {
            int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
            YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
                dest[aidx]= needAlpha ? A : 255;
                dest[0]= B>>22;
                dest[1]= G>>22;
                dest[2]= R>>22;
                dest+= step;
            }
        } else {
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
                    dest[aidx]= A;
                    dest[0]= B>>22;
                    dest[1]= G>>22;
                    dest[2]= R>>22;
                    dest+= step;
                }
            } else {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
                    dest[aidx]= 255;
                    dest[0]= B>>22;
                    dest[1]= G>>22;
                    dest[2]= R>>22;
                    dest+= step;
                }
            }
        }
        break;
    default:
        assert(0);
    }
}
1083
/**
 * Set a rectangular band of a plane to a constant byte value.
 *
 * Starting at row y, the first width bytes of height consecutive rows
 * (rows are stride bytes apart) are set to val.
 */
static void fillPlane(uint8_t* plane, int stride, int width, int height, int y, uint8_t val)
{
    uint8_t *row = plane + stride*y;
    int rowsLeft = height;

    while (rowsLeft-- > 0) {
        memset(row, val, width);
        row += stride;
    }
}
1093
1094 static inline void rgb48ToY(uint8_t *dst, const uint8_t *src, int width)
1095 {
1096     int i;
1097     for (i = 0; i < width; i++) {
1098         int r = src[i*6+0];
1099         int g = src[i*6+2];
1100         int b = src[i*6+4];
1101
1102         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1103     }
1104 }
1105
1106 static inline void rgb48ToUV(uint8_t *dstU, uint8_t *dstV,
1107                              uint8_t *src1, uint8_t *src2, int width)
1108 {
1109     int i;
1110     assert(src1==src2);
1111     for (i = 0; i < width; i++) {
1112         int r = src1[6*i + 0];
1113         int g = src1[6*i + 2];
1114         int b = src1[6*i + 4];
1115
1116         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1117         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1118     }
1119 }
1120
1121 static inline void rgb48ToUV_half(uint8_t *dstU, uint8_t *dstV,
1122                                   uint8_t *src1, uint8_t *src2, int width)
1123 {
1124     int i;
1125     assert(src1==src2);
1126     for (i = 0; i < width; i++) {
1127         int r= src1[12*i + 0] + src1[12*i + 6];
1128         int g= src1[12*i + 2] + src1[12*i + 8];
1129         int b= src1[12*i + 4] + src1[12*i + 10];
1130
1131         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1132         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1133     }
1134 }
1135
/*
 * Generate a packed-RGB -> 8-bit luma line converter called `name`.
 *
 *   type             unsigned integer type holding one packed pixel
 *   shr/shg/shb      right shift applied to the pixel before masking
 *   maskr/maskg/maskb mask applied after the shift to isolate a component
 *   RY/GY/BY         luma coefficients, pre-scaled by the caller to match
 *                    whatever bit position the masked component is left in
 *   S                final right shift; 33<<(S-1) is the offset-plus-rounding term
 *
 * The generated function ignores its last parameter.
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void name(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)\
{\
    const type *pixels= (const type*)src;\
    int x;\
    for (x=0; x<width; x++) {\
        const type px= pixels[x];\
        const int r= (px>>shr)&maskr;\
        const int g= (px>>shg)&maskg;\
        const int b= (px>>shb)&maskb;\
\
        dst[x]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}
1148
/*
 * Luma converters for the packed 32-, 16- and 15-bit formats.
 * Argument order: type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S.
 *
 * 32-bit variants: the component taken from bits 16-23 is shifted down and
 * green is masked in place (0xFF00), so the red and blue coefficients are
 * scaled by 1<<8 and the final shift is 8 larger to compensate.
 * 16/15-bit variants: nothing is shifted; each component is masked in place
 * and the coefficients of the lower-placed components are scaled up instead.
 */
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1155
/**
 * Extract the alpha plane from a line of 4-byte pixels whose first byte is
 * alpha: dst[n] receives byte 0 of pixel n. The last parameter is ignored.
 */
static inline void abgrToA(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    const uint8_t *in = src;
    uint8_t *out = dst;
    int n;

    for (n = 0; n < width; n++) {
        *out++ = *in;
        in += 4;
    }
}
1163
/*
 * Generate two packed-RGB -> 8-bit chroma line converters:
 *   name        one U and one V sample per input pixel
 *   name_half   one U and one V sample per pair of input pixels
 *
 *   type              unsigned integer type holding one packed pixel
 *   shr/shg/shb       right shift applied AFTER masking (note: BGR2Y shifts first)
 *   maska             not referenced by the macro body
 *   maskr/maskg/maskb masks isolating each component inside the pixel
 *   RU..BV            chroma coefficients, pre-scaled by the caller to match
 *                     the bit position the component is left in
 *   S                 final right shift; 257<<(S-1) is the offset-plus-rounding term
 *
 * The generated functions do not read `dummy` or `unused`.
 */
#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void name(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++) {\
        int b= (((const type*)src)[i]&maskb)>>shb;\
        int g= (((const type*)src)[i]&maskg)>>shg;\
        int r= (((const type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void name ## _half(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++) {\
        int pix0= ((const type*)src)[2*i+0];\
        int pix1= ((const type*)src)[2*i+1];\
        /* sum of everything that is neither red nor blue in the two pixels */\
        int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
        /* pix0+pix1-g is the sum of the red and blue fields only; each summed */\
        /* field may carry into one extra bit, hence mask|(2*mask) */\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
        /* keep only the (one bit wider) green sum, dropping any other bits */\
        g&= maskg|(2*maskg);\
\
        g>>=shg;\
\
        /* components are two-pixel sums, so shift by one more bit */\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
1194
/*
 * Chroma converters (full and _half variants) for the packed 32-, 16- and
 * 15-bit formats.
 * Argument order: type, name, shr, shg, shb, maska, maskr, maskg, maskb,
 *                 RU, GU, BU, RV, GV, BV, S.
 *
 * As with the luma converters, green is left in place (bits 8-15 resp. 5-10)
 * and the coefficients of the other components are scaled up to match, with
 * the final shift S enlarged accordingly.
 */
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1201
/**
 * Convert a line of 8-bit palette indices to luma.
 *
 * Each source byte indexes pal; the low 8 bits of the palette entry are
 * stored as the luma value.
 */
static inline void palToY(uint8_t *dst, const uint8_t *src, long width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const uint8_t index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1211
/**
 * Convert a line of 8-bit palette indices to chroma.
 *
 * Each source byte indexes pal; bits 8-15 of the palette entry are stored as
 * U and bits 16-23 as V.
 *
 * src2 must be the same pointer as src1 (asserted); only src1 is read.
 */
static inline void palToUV(uint8_t *dstU, uint8_t *dstV,
                           const uint8_t *src1, const uint8_t *src2,
                           long width, uint32_t *pal)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++) {
        /* Keep the entry unsigned: entries with bit 31 set would otherwise be
         * converted to a negative int and right-shifted, both of which are
         * implementation-defined. */
        const uint32_t p= pal[src1[i]];

        dstU[i]= (p>> 8) & 0xFF;
        dstV[i]= (p>>16) & 0xFF;
    }
}
1225
/**
 * Expand a line of 1 bit-per-pixel data, where a set bit means black, to
 * 8-bit luma (0 or 255).
 *
 * Bits are consumed MSB-first and inverted before expansion. When width is
 * not a multiple of 8, the remaining width&7 pixels are taken from the top
 * bits of the next source byte; nothing is written past dst[width-1].
 * The last parameter is ignored.
 */
static inline void monowhite2Y(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    int i, j;
    int fullBytes, tailBits;

    if (width <= 0)
        return;

    fullBytes= width/8;
    tailBits = width&7;

    for (i=0; i<fullBytes; i++) {
        int d= ~src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
    /* partial last byte: only its top tailBits bits are pixels */
    if (tailBits) {
        int d= ~src[i];
        for(j=0; j<tailBits; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}
1235
/**
 * Expand a line of 1 bit-per-pixel data, where a set bit means white, to
 * 8-bit luma (0 or 255).
 *
 * Bits are consumed MSB-first. When width is not a multiple of 8, the
 * remaining width&7 pixels are taken from the top bits of the next source
 * byte; nothing is written past dst[width-1].
 * The last parameter is ignored.
 */
static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    int i, j;
    int fullBytes, tailBits;

    if (width <= 0)
        return;

    fullBytes= width/8;
    tailBits = width&7;

    for (i=0; i<fullBytes; i++) {
        int d= src[i];
        for(j=0; j<8; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
    /* partial last byte: only its top tailBits bits are pixels */
    if (tailBits) {
        int d= src[i];
        for(j=0; j<tailBits; j++)
            dst[8*i+j]= ((d>>(7-j))&1)*255;
    }
}
1245
1246
/*
 * Decide which variants of swscale_template.c get compiled into this file.
 * Note: there are C, MMX, MMX2 and 3DNOW versions (plus AltiVec on PPC);
 * there is no combined 3DNOW+MMX2 one.
 *
 * With CONFIG_RUNTIME_CPUDETECT every variant available for the architecture
 * is built; otherwise only the one matching the HAVE_* build flags is.
 */

/* Plain C version: built when neither the (GPL-only) MMX code nor AltiVec
 * replaces it, or when CPU detection happens at run time. */
#if ((!HAVE_MMX || !CONFIG_GPL) && !HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_C
#endif

#if ARCH_PPC
#if HAVE_ALTIVEC || CONFIG_RUNTIME_CPUDETECT
#define COMPILE_ALTIVEC
#endif
#endif //ARCH_PPC

#if ARCH_X86

/* All x86 SIMD variants additionally require CONFIG_GPL. */

/* plain MMX: only when no better extension is statically selected */
#if ((HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT) && CONFIG_GPL
#define COMPILE_MMX
#endif

#if (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT) && CONFIG_GPL
#define COMPILE_MMX2
#endif

/* 3DNow!: only when MMX2 is not statically selected */
#if ((HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT) && CONFIG_GPL
#define COMPILE_3DNOW
#endif
#endif //ARCH_X86

/* Defaults seen by the plain C instantiation of the template; each SIMD
 * instantiation below #undef's and redefines the ones it needs. */
#define COMPILE_TEMPLATE_MMX 0
#define COMPILE_TEMPLATE_MMX2 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_ALTIVEC 0
1278
/*
 * Instantiate swscale_template.c once per selected variant.
 * Before each #include, RENAME() is redefined to append a variant-specific
 * suffix (_C, _altivec, _MMX, _MMX2, _3DNow) and the COMPILE_TEMPLATE_* flags
 * are set to describe the instruction sets that variant may use.
 */

/* Plain C version: all COMPILE_TEMPLATE_* flags are still 0 here */
#ifdef COMPILE_C
#define RENAME(a) a ## _C
#include "swscale_template.c"
#endif

/* AltiVec version */
#ifdef COMPILE_ALTIVEC
#undef RENAME
#undef COMPILE_TEMPLATE_ALTIVEC
#define COMPILE_TEMPLATE_ALTIVEC 1
#define RENAME(a) a ## _altivec
#include "swscale_template.c"
#endif

#if ARCH_X86

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#undef COMPILE_TEMPLATE_MMX
#undef COMPILE_TEMPLATE_MMX2
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMX 1
#define COMPILE_TEMPLATE_MMX2 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define RENAME(a) a ## _MMX
#include "swscale_template.c"
#endif

//MMX2 versions (MMX stays enabled as well)
#ifdef COMPILE_MMX2
#undef RENAME
#undef COMPILE_TEMPLATE_MMX
#undef COMPILE_TEMPLATE_MMX2
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMX 1
#define COMPILE_TEMPLATE_MMX2 1
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define RENAME(a) a ## _MMX2
#include "swscale_template.c"
#endif

//3DNOW versions (MMX stays enabled, MMX2 is off)
#ifdef COMPILE_3DNOW
#undef RENAME
#undef COMPILE_TEMPLATE_MMX
#undef COMPILE_TEMPLATE_MMX2
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMX 1
#define COMPILE_TEMPLATE_MMX2 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3DNow
#include "swscale_template.c"
#endif

#endif //ARCH_X86
1334
/**
 * Evaluate a piecewise cubic at distance dist.
 *
 * For dist <= 1 the result is a + b*dist + c*dist^2 + d*dist^3. For larger
 * dist the coefficients are advanced to the next unit-length piece (constant
 * term 0, new b/c/d derived from the old b, c and d) and dist is reduced by
 * one, repeatedly, until dist <= 1.
 */
static double getSplineCoeff(double a, double b, double c, double d, double dist)
{
    /* same test as the terminating branch, negated, so every input takes
     * exactly the path it took before */
    while (!(dist<=1.0)) {
        const double nextB=  b+ 2.0*c + 3.0*d;
        const double nextC=         c + 3.0*d;
        const double nextD= -b- 3.0*c - 6.0*d;

        a= 0.0;
        b= nextB;
        c= nextC;
        d= nextD;
        dist= dist-1.0;
    }
    return ((d*dist + c)*dist + b)*dist +a;
}
1345
1346 static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
1347                              int srcW, int dstW, int filterAlign, int one, int flags,
1348                              SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
1349 {
1350     int i;
1351     int filterSize;
1352     int filter2Size;
1353     int minFilterSize;
1354     int64_t *filter=NULL;
1355     int64_t *filter2=NULL;
1356     const int64_t fone= 1LL<<54;
1357     int ret= -1;
1358 #if ARCH_X86
1359     if (flags & SWS_CPU_CAPS_MMX)
1360         __asm__ volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
1361 #endif
1362
1363     // NOTE: the +1 is for the MMX scaler which reads over the end
1364     FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW+1)*sizeof(int16_t), fail);
1365
1366     if (FFABS(xInc - 0x10000) <10) { // unscaled
1367         int i;
1368         filterSize= 1;
1369         FF_ALLOCZ_OR_GOTO(NULL, filter, dstW*sizeof(*filter)*filterSize, fail);
1370
1371         for (i=0; i<dstW; i++) {
1372             filter[i*filterSize]= fone;
1373             (*filterPos)[i]=i;
1374         }
1375
1376     } else if (flags&SWS_POINT) { // lame looking point sampling mode
1377         int i;
1378         int xDstInSrc;
1379         filterSize= 1;
1380         FF_ALLOC_OR_GOTO(NULL, filter, dstW*sizeof(*filter)*filterSize, fail);
1381
1382         xDstInSrc= xInc/2 - 0x8000;
1383         for (i=0; i<dstW; i++) {
1384             int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
1385
1386             (*filterPos)[i]= xx;
1387             filter[i]= fone;
1388             xDstInSrc+= xInc;
1389         }
1390     } else if ((xInc <= (1<<16) && (flags&SWS_AREA)) || (flags&SWS_FAST_BILINEAR)) { // bilinear upscale
1391         int i;
1392         int xDstInSrc;
1393         filterSize= 2;
1394         FF_ALLOC_OR_GOTO(NULL, filter, dstW*sizeof(*filter)*filterSize, fail);
1395
1396         xDstInSrc= xInc/2 - 0x8000;
1397         for (i=0; i<dstW; i++) {
1398             int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
1399             int j;
1400
1401             (*filterPos)[i]= xx;
1402             //bilinear upscale / linear interpolate / area averaging
1403             for (j=0; j<filterSize; j++) {
1404                 int64_t coeff= fone - FFABS((xx<<16) - xDstInSrc)*(fone>>16);
1405                 if (coeff<0) coeff=0;
1406                 filter[i*filterSize + j]= coeff;
1407                 xx++;
1408             }
1409             xDstInSrc+= xInc;
1410         }
1411     } else {
1412         int xDstInSrc;
1413         int sizeFactor;
1414
1415         if      (flags&SWS_BICUBIC)      sizeFactor=  4;
1416         else if (flags&SWS_X)            sizeFactor=  8;
1417         else if (flags&SWS_AREA)         sizeFactor=  1; //downscale only, for upscale it is bilinear
1418         else if (flags&SWS_GAUSS)        sizeFactor=  8;   // infinite ;)
1419         else if (flags&SWS_LANCZOS)      sizeFactor= param[0] != SWS_PARAM_DEFAULT ? ceil(2*param[0]) : 6;
1420         else if (flags&SWS_SINC)         sizeFactor= 20; // infinite ;)
1421         else if (flags&SWS_SPLINE)       sizeFactor= 20;  // infinite ;)
1422         else if (flags&SWS_BILINEAR)     sizeFactor=  2;
1423         else {
1424             sizeFactor= 0; //GCC warning killer
1425             assert(0);
1426         }
1427
1428         if (xInc <= 1<<16)      filterSize= 1 + sizeFactor; // upscale
1429         else                    filterSize= 1 + (sizeFactor*srcW + dstW - 1)/ dstW;
1430
1431         if (filterSize > srcW-2) filterSize=srcW-2;
1432
1433         FF_ALLOC_OR_GOTO(NULL, filter, dstW*sizeof(*filter)*filterSize, fail);
1434
1435         xDstInSrc= xInc - 0x10000;
1436         for (i=0; i<dstW; i++) {
1437             int xx= (xDstInSrc - ((filterSize-2)<<16)) / (1<<17);
1438             int j;
1439             (*filterPos)[i]= xx;
1440             for (j=0; j<filterSize; j++) {
1441                 int64_t d= ((int64_t)FFABS((xx<<17) - xDstInSrc))<<13;
1442                 double floatd;
1443                 int64_t coeff;
1444
1445                 if (xInc > 1<<16)
1446                     d= d*dstW/srcW;
1447                 floatd= d * (1.0/(1<<30));
1448
1449                 if (flags & SWS_BICUBIC) {
1450                     int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] :   0) * (1<<24);
1451                     int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24);
1452                     int64_t dd = ( d*d)>>30;
1453                     int64_t ddd= (dd*d)>>30;
1454
1455                     if      (d < 1LL<<30)
1456                         coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
1457                     else if (d < 1LL<<31)
1458                         coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
1459                     else
1460                         coeff=0.0;
1461                     coeff *= fone>>(30+24);
1462                 }
1463 /*                else if (flags & SWS_X) {
1464                     double p= param ? param*0.01 : 0.3;
1465                     coeff = d ? sin(d*PI)/(d*PI) : 1.0;
1466                     coeff*= pow(2.0, - p*d*d);
1467                 }*/
1468                 else if (flags & SWS_X) {
1469                     double A= param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
1470                     double c;
1471
1472                     if (floatd<1.0)
1473                         c = cos(floatd*PI);
1474                     else
1475                         c=-1.0;
1476                     if (c<0.0)      c= -pow(-c, A);
1477                     else            c=  pow( c, A);
1478                     coeff= (c*0.5 + 0.5)*fone;
1479                 } else if (flags & SWS_AREA) {
1480                     int64_t d2= d - (1<<29);
1481                     if      (d2*xInc < -(1LL<<(29+16))) coeff= 1.0 * (1LL<<(30+16));
1482                     else if (d2*xInc <  (1LL<<(29+16))) coeff= -d2*xInc + (1LL<<(29+16));
1483                     else coeff=0.0;
1484                     coeff *= fone>>(30+16);
1485                 } else if (flags & SWS_GAUSS) {
1486                     double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
1487                     coeff = (pow(2.0, - p*floatd*floatd))*fone;
1488                 } else if (flags & SWS_SINC) {
1489                     coeff = (d ? sin(floatd*PI)/(floatd*PI) : 1.0)*fone;
1490                 } else if (flags & SWS_LANCZOS) {
1491                     double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
1492                     coeff = (d ? sin(floatd*PI)*sin(floatd*PI/p)/(floatd*floatd*PI*PI/p) : 1.0)*fone;
1493                     if (floatd>p) coeff=0;
1494                 } else if (flags & SWS_BILINEAR) {
1495                     coeff= (1<<30) - d;
1496                     if (coeff<0) coeff=0;
1497                     coeff *= fone >> 30;
1498                 } else if (flags & SWS_SPLINE) {
1499                     double p=-2.196152422706632;
1500                     coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, floatd) * fone;
1501                 } else {
1502                     coeff= 0.0; //GCC warning killer
1503                     assert(0);
1504                 }
1505
1506                 filter[i*filterSize + j]= coeff;
1507                 xx++;
1508             }
1509             xDstInSrc+= 2*xInc;
1510         }
1511     }
1512
1513     /* apply src & dst Filter to filter -> filter2
1514        av_free(filter);
1515     */
1516     assert(filterSize>0);
1517     filter2Size= filterSize;
1518     if (srcFilter) filter2Size+= srcFilter->length - 1;
1519     if (dstFilter) filter2Size+= dstFilter->length - 1;
1520     assert(filter2Size>0);
1521     FF_ALLOCZ_OR_GOTO(NULL, filter2, filter2Size*dstW*sizeof(*filter2), fail);
1522
1523     for (i=0; i<dstW; i++) {
1524         int j, k;
1525
1526         if(srcFilter) {
1527             for (k=0; k<srcFilter->length; k++) {
1528                 for (j=0; j<filterSize; j++)
1529                     filter2[i*filter2Size + k + j] += srcFilter->coeff[k]*filter[i*filterSize + j];
1530             }
1531         } else {
1532             for (j=0; j<filterSize; j++)
1533                 filter2[i*filter2Size + j]= filter[i*filterSize + j];
1534         }
1535         //FIXME dstFilter
1536
1537         (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
1538     }
1539     av_freep(&filter);
1540
1541     /* try to reduce the filter-size (step1 find size and shift left) */
1542     // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
1543     minFilterSize= 0;
1544     for (i=dstW-1; i>=0; i--) {
1545         int min= filter2Size;
1546         int j;
1547         int64_t cutOff=0.0;
1548
1549         /* get rid off near zero elements on the left by shifting left */
1550         for (j=0; j<filter2Size; j++) {
1551             int k;
1552             cutOff += FFABS(filter2[i*filter2Size]);
1553
1554             if (cutOff > SWS_MAX_REDUCE_CUTOFF*fone) break;
1555
1556             /* preserve monotonicity because the core can't handle the filter otherwise */
1557             if (i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
1558
1559             // move filter coefficients left
1560             for (k=1; k<filter2Size; k++)
1561                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
1562             filter2[i*filter2Size + k - 1]= 0;
1563             (*filterPos)[i]++;
1564         }
1565
1566         cutOff=0;
1567         /* count near zeros on the right */
1568         for (j=filter2Size-1; j>0; j--) {
1569             cutOff += FFABS(filter2[i*filter2Size + j]);
1570
1571             if (cutOff > SWS_MAX_REDUCE_CUTOFF*fone) break;
1572             min--;
1573         }
1574
1575         if (min>minFilterSize) minFilterSize= min;
1576     }
1577
1578     if (flags & SWS_CPU_CAPS_ALTIVEC) {
1579         // we can handle the special case 4,
1580         // so we don't want to go to the full 8
1581         if (minFilterSize < 5)
1582             filterAlign = 4;
1583
1584         // We really don't want to waste our time
1585         // doing useless computation, so fall back on
1586         // the scalar C code for very small filters.
1587         // Vectorizing is worth it only if you have a
1588         // decent-sized vector.
1589         if (minFilterSize < 3)
1590             filterAlign = 1;
1591     }
1592
1593     if (flags & SWS_CPU_CAPS_MMX) {
1594         // special case for unscaled vertical filtering
1595         if (minFilterSize == 1 && filterAlign == 2)
1596             filterAlign= 1;
1597     }
1598
1599     assert(minFilterSize > 0);
1600     filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
1601     assert(filterSize > 0);
1602     filter= av_malloc(filterSize*dstW*sizeof(*filter));
1603     if (filterSize >= MAX_FILTER_SIZE*16/((flags&SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)
1604         goto fail;
1605     *outFilterSize= filterSize;
1606
1607     if (flags&SWS_PRINT_INFO)
1608         av_log(NULL, AV_LOG_VERBOSE, "SwScaler: reducing / aligning filtersize %d -> %d\n", filter2Size, filterSize);
1609     /* try to reduce the filter-size (step2 reduce it) */
1610     for (i=0; i<dstW; i++) {
1611         int j;
1612
1613         for (j=0; j<filterSize; j++) {
1614             if (j>=filter2Size) filter[i*filterSize + j]= 0;
1615             else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
1616             if((flags & SWS_BITEXACT) && j>=minFilterSize)
1617                 filter[i*filterSize + j]= 0;
1618         }
1619     }
1620
1621
1622     //FIXME try to align filterPos if possible
1623
1624     //fix borders
1625     for (i=0; i<dstW; i++) {
1626         int j;
1627         if ((*filterPos)[i] < 0) {
1628             // move filter coefficients left to compensate for filterPos
1629             for (j=1; j<filterSize; j++) {
1630                 int left= FFMAX(j + (*filterPos)[i], 0);
1631                 filter[i*filterSize + left] += filter[i*filterSize + j];
1632                 filter[i*filterSize + j]=0;
1633             }
1634             (*filterPos)[i]= 0;
1635         }
1636
1637         if ((*filterPos)[i] + filterSize > srcW) {
1638             int shift= (*filterPos)[i] + filterSize - srcW;
1639             // move filter coefficients right to compensate for filterPos
1640             for (j=filterSize-2; j>=0; j--) {
1641                 int right= FFMIN(j + shift, filterSize-1);
1642                 filter[i*filterSize +right] += filter[i*filterSize +j];
1643                 filter[i*filterSize +j]=0;
1644             }
1645             (*filterPos)[i]= srcW - filterSize;
1646         }
1647     }
1648
1649     // Note the +1 is for the MMX scaler which reads over the end
1650     /* align at 16 for AltiVec (needed by hScale_altivec_real) */
1651     FF_ALLOCZ_OR_GOTO(NULL, *outFilter, *outFilterSize*(dstW+1)*sizeof(int16_t), fail);
1652
1653     /* normalize & store in outFilter */
1654     for (i=0; i<dstW; i++) {
1655         int j;
1656         int64_t error=0;
1657         int64_t sum=0;
1658
1659         for (j=0; j<filterSize; j++) {
1660             sum+= filter[i*filterSize + j];
1661         }
1662         sum= (sum + one/2)/ one;
1663         for (j=0; j<*outFilterSize; j++) {
1664             int64_t v= filter[i*filterSize + j] + error;
1665             int intV= ROUNDED_DIV(v, sum);
1666             (*outFilter)[i*(*outFilterSize) + j]= intV;
1667             error= v - intV*sum;
1668         }
1669     }
1670
1671     (*filterPos)[dstW]= (*filterPos)[dstW-1]; // the MMX scaler will read over the end
1672     for (i=0; i<*outFilterSize; i++) {
1673         int j= dstW*(*outFilterSize);
1674         (*outFilter)[j + i]= (*outFilter)[j + i - (*outFilterSize)];
1675     }
1676
1677     ret=0;
1678 fail:
1679     av_free(filter);
1680     av_free(filter2);
1681     return ret;
1682 }
1683
1684 #ifdef COMPILE_MMX2
/**
 * Builds the runtime-generated MMX2 "fast bilinear" horizontal scaler.
 *
 * For every group of four output pixels one of two machine-code fragments
 * (A or B, captured by the two asm statements below) is appended to
 * filterCode, its two pshufw immediates are patched, and the matching
 * entries of filter and filterPos are written. A RET is stored right after
 * the last fragment.
 *
 * When filterCode is NULL nothing is written to filterCode, filter or
 * filterPos; the function then only computes the size of the code.
 *
 * @param dstW       output width; the loop covers dstW/numSplits pixels
 * @param xInc       source step per output pixel, 16.16 fixed point
 * @param filterCode buffer receiving the generated code, or NULL
 * @param filter     receives one 7-bit coefficient per output pixel
 * @param filterPos  receives one source position per pair of output pixels
 *                   (indexed with i/2)
 * @param numSplits  divisor applied to dstW for the loop bound
 * @return total length of the copied fragments plus one (the RET slot)
 */
1685 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *filter, int32_t *filterPos, int numSplits)
1686 {
1687     uint8_t *fragmentA;
1688     x86_reg imm8OfPShufW1A;
1689     x86_reg imm8OfPShufW2A;
1690     x86_reg fragmentLengthA;
1691     uint8_t *fragmentB;
1692     x86_reg imm8OfPShufW1B;
1693     x86_reg imm8OfPShufW2B;
1694     x86_reg fragmentLengthB;
1695     int fragmentPos;
1696
1697     int xpos, i;
1698
1699     // create an optimized horizontal scaling routine
1700     /* This scaler is made of runtime-generated MMX2 code using specially
1701      * tuned pshufw instructions. For every four output pixels, if four
1702      * input pixels are enough for the fast bilinear scaling, then a chunk
1703      * of fragmentB is used. If five input pixels are needed, then a chunk
1704      * of fragmentA is used.
1705      */
1706
1707     //code fragment
1708
    /* Fragment A. The code between labels 0 and 9 is not executed here: the
     * leading jmp skips it. The lea/dec/sub sequence after label 9 only
     * reports where the fragment lives:
     *   %0 = address of label 0 (start of the fragment),
     *   %1 = offset from label 0 of the byte just before label 1,
     *   %2 = offset from label 0 of the byte just before label 2,
     *   %3 = byte length of the fragment (label 9 - label 0).
     * The bytes just before labels 1 and 2 are the last bytes of the two
     * pshufw instructions, i.e. the immediates that get patched below. */
1709     __asm__ volatile(
1710         "jmp                         9f                 \n\t"
1711     // Begin
1712         "0:                                             \n\t"
1713         "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t"
1714         "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t"
1715         "movd   1(%%"REG_c", %%"REG_S"), %%mm1          \n\t"
1716         "punpcklbw                %%mm7, %%mm1          \n\t"
1717         "punpcklbw                %%mm7, %%mm0          \n\t"
1718         "pshufw                   $0xFF, %%mm1, %%mm1   \n\t"
1719         "1:                                             \n\t"
1720         "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
1721         "2:                                             \n\t"
1722         "psubw                    %%mm1, %%mm0          \n\t"
1723         "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t"
1724         "pmullw                   %%mm3, %%mm0          \n\t"
1725         "psllw                       $7, %%mm1          \n\t"
1726         "paddw                    %%mm1, %%mm0          \n\t"
1727
1728         "movq                     %%mm0, (%%"REG_D", %%"REG_a") \n\t"
1729
1730         "add                         $8, %%"REG_a"      \n\t"
1731     // End
1732         "9:                                             \n\t"
1733 //        "int $3                                         \n\t"
1734         "lea                 " LOCAL_MANGLE(0b) ", %0   \n\t"
1735         "lea                 " LOCAL_MANGLE(1b) ", %1   \n\t"
1736         "lea                 " LOCAL_MANGLE(2b) ", %2   \n\t"
1737         "dec                         %1                 \n\t"
1738         "dec                         %2                 \n\t"
1739         "sub                         %0, %1             \n\t"
1740         "sub                         %0, %2             \n\t"
1741         "lea                 " LOCAL_MANGLE(9b) ", %3   \n\t"
1742         "sub                         %0, %3             \n\t"
1743
1744
1745         :"=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
1746         "=r" (fragmentLengthA)
1747     );
1748
    /* Fragment B: same bookkeeping as above. The fragment differs from A in
     * that it issues a single movd load and derives mm1 from mm0 with the
     * first pshufw instead of loading mm1 from offset 1. */
1749     __asm__ volatile(
1750         "jmp                         9f                 \n\t"
1751     // Begin
1752         "0:                                             \n\t"
1753         "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t"
1754         "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t"
1755         "punpcklbw                %%mm7, %%mm0          \n\t"
1756         "pshufw                   $0xFF, %%mm0, %%mm1   \n\t"
1757         "1:                                             \n\t"
1758         "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
1759         "2:                                             \n\t"
1760         "psubw                    %%mm1, %%mm0          \n\t"
1761         "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t"
1762         "pmullw                   %%mm3, %%mm0          \n\t"
1763         "psllw                       $7, %%mm1          \n\t"
1764         "paddw                    %%mm1, %%mm0          \n\t"
1765
1766         "movq                     %%mm0, (%%"REG_D", %%"REG_a") \n\t"
1767
1768         "add                         $8, %%"REG_a"      \n\t"
1769     // End
1770         "9:                                             \n\t"
1771 //        "int                       $3                   \n\t"
1772         "lea                 " LOCAL_MANGLE(0b) ", %0   \n\t"
1773         "lea                 " LOCAL_MANGLE(1b) ", %1   \n\t"
1774         "lea                 " LOCAL_MANGLE(2b) ", %2   \n\t"
1775         "dec                         %1                 \n\t"
1776         "dec                         %2                 \n\t"
1777         "sub                         %0, %1             \n\t"
1778         "sub                         %0, %2             \n\t"
1779         "lea                 " LOCAL_MANGLE(9b) ", %3   \n\t"
1780         "sub                         %0, %3             \n\t"
1781
1782
1783         :"=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
1784         "=r" (fragmentLengthB)
1785     );
1786
1787     xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers
1788     fragmentPos=0;
1789
    /* xpos is the 16.16 fixed-point source position of output pixel i;
     * a fragment is emitted once per group of four output pixels. */
1790     for (i=0; i<dstW/numSplits; i++) {
1791         int xx=xpos>>16;
1792
1793         if ((i&3) == 0) {
            /* a..d: integer source offsets of the four outputs of this
             * group, relative to xx */
1794             int a=0;
1795             int b=((xpos+xInc)>>16) - xx;
1796             int c=((xpos+xInc*2)>>16) - xx;
1797             int d=((xpos+xInc*3)>>16) - xx;
            /* d+1<4 selects fragment B (and sets inc to 1); otherwise
             * fragment A is used and inc is 0 */
1798             int inc                = (d+1<4);
1799             uint8_t *fragment      = (d+1<4) ? fragmentB       : fragmentA;
1800             x86_reg imm8OfPShufW1  = (d+1<4) ? imm8OfPShufW1B  : imm8OfPShufW1A;
1801             x86_reg imm8OfPShufW2  = (d+1<4) ? imm8OfPShufW2B  : imm8OfPShufW2A;
1802             x86_reg fragmentLength = (d+1<4) ? fragmentLengthB : fragmentLengthA;
            /* largest value that can still be added to every 2-bit selector
             * (the biggest one is d+inc) without exceeding 3 */
1803             int maxShift= 3-(d+inc);
1804             int shift=0;
1805
1806             if (filterCode) {
                /* coefficient = inverted 16-bit fractional part of the
                 * position, reduced to 7 bits */
1807                 filter[i  ] = (( xpos         & 0xFFFF) ^ 0xFFFF)>>9;
1808                 filter[i+1] = (((xpos+xInc  ) & 0xFFFF) ^ 0xFFFF)>>9;
1809                 filter[i+2] = (((xpos+xInc*2) & 0xFFFF) ^ 0xFFFF)>>9;
1810                 filter[i+3] = (((xpos+xInc*3) & 0xFFFF) ^ 0xFFFF)>>9;
1811                 filterPos[i/2]= xx;
1812
1813                 memcpy(filterCode + fragmentPos, fragment, fragmentLength);
1814
                /* patch the two pshufw immediates of the copied fragment:
                 * four 2-bit word selectors packed into one byte each */
1815                 filterCode[fragmentPos + imm8OfPShufW1]=
1816                     (a+inc) | ((b+inc)<<2) | ((c+inc)<<4) | ((d+inc)<<6);
1817                 filterCode[fragmentPos + imm8OfPShufW2]=
1818                     a | (b<<2) | (c<<4) | (d<<6);
1819
1820                 if (i+4-inc>=dstW) shift=maxShift; //avoid overread
1821                 else if ((filterPos[i/2]&3) <= maxShift) shift=filterPos[i/2]&3; //Align
1822
                /* 0x55*shift adds shift to each of the four 2-bit selectors;
                 * filterPos moves back by the same amount to compensate */
1823                 if (shift && i>=shift) {
1824                     filterCode[fragmentPos + imm8OfPShufW1]+= 0x55*shift;
1825                     filterCode[fragmentPos + imm8OfPShufW2]+= 0x55*shift;
1826                     filterPos[i/2]-=shift;
1827                 }
1828             }
1829
1830             fragmentPos+= fragmentLength;
1831
            /* terminate the code after the fragment just written; a later
             * fragment, if any, overwrites this byte */
1832             if (filterCode)
1833                 filterCode[fragmentPos]= RET;
1834         }
1835         xpos+=xInc;
1836     }
1837     if (filterCode)
1838         filterPos[((i/2)+1)&(~1)]= xpos>>16; // needed to jump to the next part
1839
    /* +1 accounts for the RET slot after the last fragment */
1840     return fragmentPos + 1;
1841 }
1842 #endif /* COMPILE_MMX2 */
1843
1844 static void globalInit(void)
1845 {
1846     // generating tables:
1847     int i;
1848     for (i=0; i<768; i++) {
1849         int c= av_clip_uint8(i-256);
1850         clip_table[i]=c;
1851     }
1852 }
1853
1854 static SwsFunc getSwsFunc(SwsContext *c)
1855 {
1856 #if CONFIG_RUNTIME_CPUDETECT
1857     int flags = c->flags;
1858
1859 #if ARCH_X86 && CONFIG_GPL
1860     // ordered per speed fastest first
1861     if (flags & SWS_CPU_CAPS_MMX2) {
1862         sws_init_swScale_MMX2(c);
1863         return swScale_MMX2;
1864     } else if (flags & SWS_CPU_CAPS_3DNOW) {
1865         sws_init_swScale_3DNow(c);
1866         return swScale_3DNow;
1867     } else if (flags & SWS_CPU_CAPS_MMX) {
1868         sws_init_swScale_MMX(c);
1869         return swScale_MMX;
1870     } else {
1871         sws_init_swScale_C(c);
1872         return swScale_C;
1873     }
1874
1875 #else
1876 #if ARCH_PPC
1877     if (flags & SWS_CPU_CAPS_ALTIVEC) {
1878         sws_init_swScale_altivec(c);
1879         return swScale_altivec;
1880     } else {
1881         sws_init_swScale_C(c);
1882         return swScale_C;
1883     }
1884 #endif
1885     sws_init_swScale_C(c);
1886     return swScale_C;
1887 #endif /* ARCH_X86 && CONFIG_GPL */
1888 #else //CONFIG_RUNTIME_CPUDETECT
1889 #if   COMPILE_TEMPLATE_MMX2
1890     sws_init_swScale_MMX2(c);
1891     return swScale_MMX2;
1892 #elif COMPILE_TEMPLATE_AMD3DNOW
1893     sws_init_swScale_3DNow(c);
1894     return swScale_3DNow;
1895 #elif COMPILE_TEMPLATE_MMX
1896     sws_init_swScale_MMX(c);
1897     return swScale_MMX;
1898 #elif COMPILE_TEMPLATE_ALTIVEC
1899     sws_init_swScale_altivec(c);
1900     return swScale_altivec;
1901 #else
1902     sws_init_swScale_C(c);
1903     return swScale_C;
1904 #endif
1905 #endif //!CONFIG_RUNTIME_CPUDETECT
1906 }
1907
1908 static int PlanarToNV12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1909                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
1910 {
1911     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1912     /* Copy Y plane */
1913     if (dstStride[0]==srcStride[0] && srcStride[0] > 0)
1914         memcpy(dst, src[0], srcSliceH*dstStride[0]);
1915     else {
1916         int i;
1917         const uint8_t *srcPtr= src[0];
1918         uint8_t *dstPtr= dst;
1919         for (i=0; i<srcSliceH; i++) {
1920             memcpy(dstPtr, srcPtr, c->srcW);
1921             srcPtr+= srcStride[0];
1922             dstPtr+= dstStride[0];
1923         }
1924     }
1925     dst = dstParam[1] + dstStride[1]*srcSliceY/2;
1926     if (c->dstFormat == PIX_FMT_NV12)
1927         interleaveBytes(src[1], src[2], dst, c->srcW/2, srcSliceH/2, srcStride[1], srcStride[2], dstStride[0]);
1928     else
1929         interleaveBytes(src[2], src[1], dst, c->srcW/2, srcSliceH/2, srcStride[2], srcStride[1], dstStride[0]);
1930
1931     return srcSliceH;
1932 }
1933
1934 static int PlanarToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1935                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
1936 {
1937     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1938
1939     yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
1940
1941     return srcSliceH;
1942 }
1943
1944 static int PlanarToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1945                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
1946 {
1947     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1948
1949     yv12touyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
1950
1951     return srcSliceH;
1952 }
1953
1954 static int YUV422PToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1955                                 int srcSliceH, uint8_t* dstParam[], int dstStride[])
1956 {
1957     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1958
1959     yuv422ptoyuy2(src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0]);
1960
1961     return srcSliceH;
1962 }
1963
1964 static int YUV422PToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1965                                 int srcSliceH, uint8_t* dstParam[], int dstStride[])
1966 {
1967     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
1968
1969     yuv422ptouyvy(src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0]);
1970
1971     return srcSliceH;
1972 }
1973
1974 static int YUYV2YUV420Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1975                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
1976 {
1977     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
1978     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY/2;
1979     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY/2;
1980
1981     yuyvtoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
1982
1983     if (dstParam[3])
1984         fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
1985
1986     return srcSliceH;
1987 }
1988
1989 static int YUYV2YUV422Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
1990                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
1991 {
1992     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
1993     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY;
1994     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY;
1995
1996     yuyvtoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
1997
1998     return srcSliceH;
1999 }
2000
2001 static int UYVY2YUV420Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2002                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2003 {
2004     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
2005     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY/2;
2006     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY/2;
2007
2008     uyvytoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
2009
2010     if (dstParam[3])
2011         fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
2012
2013     return srcSliceH;
2014 }
2015
2016 static int UYVY2YUV422Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2017                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2018 {
2019     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
2020     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY;
2021     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY;
2022
2023     uyvytoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
2024
2025     return srcSliceH;
2026 }
2027
2028 static int pal2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2029                           int srcSliceH, uint8_t* dst[], int dstStride[])
2030 {
2031     const enum PixelFormat srcFormat= c->srcFormat;
2032     const enum PixelFormat dstFormat= c->dstFormat;
2033     void (*conv)(const uint8_t *src, uint8_t *dst, long num_pixels,
2034                  const uint8_t *palette)=NULL;
2035     int i;
2036     uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
2037     uint8_t *srcPtr= src[0];
2038
2039     if (!usePal(srcFormat))
2040         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
2041                sws_format_name(srcFormat), sws_format_name(dstFormat));
2042
2043     switch(dstFormat) {
2044     case PIX_FMT_RGB32  : conv = palette8topacked32; break;
2045     case PIX_FMT_BGR32  : conv = palette8topacked32; break;
2046     case PIX_FMT_BGR32_1: conv = palette8topacked32; break;
2047     case PIX_FMT_RGB32_1: conv = palette8topacked32; break;
2048     case PIX_FMT_RGB24  : conv = palette8topacked24; break;
2049     case PIX_FMT_BGR24  : conv = palette8topacked24; break;
2050     default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
2051                     sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
2052     }
2053
2054
2055     for (i=0; i<srcSliceH; i++) {
2056         conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
2057         srcPtr+= srcStride[0];
2058         dstPtr+= dstStride[0];
2059     }
2060
2061     return srcSliceH;
2062 }
2063
2064 /* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
2065 static int rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2066                           int srcSliceH, uint8_t* dst[], int dstStride[])
2067 {
2068     const enum PixelFormat srcFormat= c->srcFormat;
2069     const enum PixelFormat dstFormat= c->dstFormat;
2070     const int srcBpp= (fmt_depth(srcFormat) + 7) >> 3;
2071     const int dstBpp= (fmt_depth(dstFormat) + 7) >> 3;
2072     const int srcId= fmt_depth(srcFormat) >> 2; /* 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8 */
2073     const int dstId= fmt_depth(dstFormat) >> 2;
2074     void (*conv)(const uint8_t *src, uint8_t *dst, long src_size)=NULL;
2075
2076     /* BGR -> BGR */
2077     if (  (isBGR(srcFormat) && isBGR(dstFormat))
2078        || (isRGB(srcFormat) && isRGB(dstFormat))) {
2079         switch(srcId | (dstId<<4)) {
2080         case 0x34: conv= rgb16to15; break;
2081         case 0x36: conv= rgb24to15; break;
2082         case 0x38: conv= rgb32to15; break;
2083         case 0x43: conv= rgb15to16; break;
2084         case 0x46: conv= rgb24to16; break;
2085         case 0x48: conv= rgb32to16; break;
2086         case 0x63: conv= rgb15to24; break;
2087         case 0x64: conv= rgb16to24; break;
2088         case 0x68: conv= rgb32to24; break;
2089         case 0x83: conv= rgb15to32; break;
2090         case 0x84: conv= rgb16to32; break;
2091         case 0x86: conv= rgb24to32; break;
2092         default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
2093                         sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
2094         }
2095     } else if (  (isBGR(srcFormat) && isRGB(dstFormat))
2096              || (isRGB(srcFormat) && isBGR(dstFormat))) {
2097         switch(srcId | (dstId<<4)) {
2098         case 0x33: conv= rgb15tobgr15; break;
2099         case 0x34: conv= rgb16tobgr15; break;
2100         case 0x36: conv= rgb24tobgr15; break;
2101         case 0x38: conv= rgb32tobgr15; break;
2102         case 0x43: conv= rgb15tobgr16; break;
2103         case 0x44: conv= rgb16tobgr16; break;
2104         case 0x46: conv= rgb24tobgr16; break;
2105         case 0x48: conv= rgb32tobgr16; break;
2106         case 0x63: conv= rgb15tobgr24; break;
2107         case 0x64: conv= rgb16tobgr24; break;
2108         case 0x66: conv= rgb24tobgr24; break;
2109         case 0x68: conv= rgb32tobgr24; break;
2110         case 0x83: conv= rgb15tobgr32; break;
2111         case 0x84: conv= rgb16tobgr32; break;
2112         case 0x86: conv= rgb24tobgr32; break;
2113         case 0x88: conv= rgb32tobgr32; break;
2114         default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
2115                         sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
2116         }
2117     } else {
2118         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
2119                sws_format_name(srcFormat), sws_format_name(dstFormat));
2120     }
2121
2122     if(conv) {
2123         uint8_t *srcPtr= src[0];
2124         if(srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
2125             srcPtr += ALT32_CORR;
2126
2127         if (dstStride[0]*srcBpp == srcStride[0]*dstBpp && srcStride[0] > 0)
2128             conv(srcPtr, dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
2129         else {
2130             int i;
2131             uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
2132
2133             for (i=0; i<srcSliceH; i++) {
2134                 conv(srcPtr, dstPtr, c->srcW*srcBpp);
2135                 srcPtr+= srcStride[0];
2136                 dstPtr+= dstStride[0];
2137             }
2138         }
2139     }
2140     return srcSliceH;
2141 }
2142
2143 static int bgr24toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2144                               int srcSliceH, uint8_t* dst[], int dstStride[])
2145 {
2146
2147     rgb24toyv12(
2148         src[0],
2149         dst[0]+ srcSliceY    *dstStride[0],
2150         dst[1]+(srcSliceY>>1)*dstStride[1],
2151         dst[2]+(srcSliceY>>1)*dstStride[2],
2152         c->srcW, srcSliceH,
2153         dstStride[0], dstStride[1], srcStride[0]);
2154     if (dst[3])
2155         fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
2156     return srcSliceH;
2157 }
2158
2159 static int yvu9toyv12Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2160                              int srcSliceH, uint8_t* dst[], int dstStride[])
2161 {
2162     int i;
2163
2164     /* copy Y */
2165     if (srcStride[0]==dstStride[0] && srcStride[0] > 0)
2166         memcpy(dst[0]+ srcSliceY*dstStride[0], src[0], srcStride[0]*srcSliceH);
2167     else {
2168         uint8_t *srcPtr= src[0];
2169         uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
2170
2171         for (i=0; i<srcSliceH; i++) {
2172             memcpy(dstPtr, srcPtr, c->srcW);
2173             srcPtr+= srcStride[0];
2174             dstPtr+= dstStride[0];
2175         }
2176     }
2177
2178     if (c->dstFormat==PIX_FMT_YUV420P || c->dstFormat==PIX_FMT_YUVA420P) {
2179         planar2x(src[1], dst[1] + dstStride[1]*(srcSliceY >> 1), c->chrSrcW,
2180                  srcSliceH >> 2, srcStride[1], dstStride[1]);
2181         planar2x(src[2], dst[2] + dstStride[2]*(srcSliceY >> 1), c->chrSrcW,
2182                  srcSliceH >> 2, srcStride[2], dstStride[2]);
2183     } else {
2184         planar2x(src[1], dst[2] + dstStride[2]*(srcSliceY >> 1), c->chrSrcW,
2185                  srcSliceH >> 2, srcStride[1], dstStride[2]);
2186         planar2x(src[2], dst[1] + dstStride[1]*(srcSliceY >> 1), c->chrSrcW,
2187                  srcSliceH >> 2, srcStride[2], dstStride[1]);
2188     }
2189     if (dst[3])
2190         fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
2191     return srcSliceH;
2192 }
2193
2194 /* unscaled copy like stuff (assumes nearly identical formats) */
2195 static int packedCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2196                       int srcSliceH, uint8_t* dst[], int dstStride[])
2197 {
2198     if (dstStride[0]==srcStride[0] && srcStride[0] > 0)
2199         memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
2200     else {
2201         int i;
2202         uint8_t *srcPtr= src[0];
2203         uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
2204         int length=0;
2205
2206         /* universal length finder */
2207         while(length+c->srcW <= FFABS(dstStride[0])
2208            && length+c->srcW <= FFABS(srcStride[0])) length+= c->srcW;
2209         assert(length!=0);
2210
2211         for (i=0; i<srcSliceH; i++) {
2212             memcpy(dstPtr, srcPtr, length);
2213             srcPtr+= srcStride[0];
2214             dstPtr+= dstStride[0];
2215         }
2216     }
2217     return srcSliceH;
2218 }
2219
2220 static int planarCopy(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2221                       int srcSliceH, uint8_t* dst[], int dstStride[])
2222 {
2223     int plane, i, j;
2224     for (plane=0; plane<4; plane++) {
2225         int length= (plane==0 || plane==3) ? c->srcW  : -((-c->srcW  )>>c->chrDstHSubSample);
2226         int y=      (plane==0 || plane==3) ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
2227         int height= (plane==0 || plane==3) ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
2228         uint8_t *srcPtr= src[plane];
2229         uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
2230
2231         if (!dst[plane]) continue;
2232         // ignore palette for GRAY8
2233         if (plane == 1 && !dst[2]) continue;
2234         if (!src[plane] || (plane == 1 && !src[2])) {
2235             if(is16BPS(c->dstFormat))
2236                 length*=2;
2237             fillPlane(dst[plane], dstStride[plane], length, height, y, (plane==3) ? 255 : 128);
2238         } else {
2239             if(is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
2240                 if (!isBE(c->srcFormat)) srcPtr++;
2241                 for (i=0; i<height; i++) {
2242                     for (j=0; j<length; j++) dstPtr[j] = srcPtr[j<<1];
2243                     srcPtr+= srcStride[plane];
2244                     dstPtr+= dstStride[plane];
2245                 }
2246             } else if(!is16BPS(c->srcFormat) && is16BPS(c->dstFormat)) {
2247                 for (i=0; i<height; i++) {
2248                     for (j=0; j<length; j++) {
2249                         dstPtr[ j<<1   ] = srcPtr[j];
2250                         dstPtr[(j<<1)+1] = srcPtr[j];
2251                     }
2252                     srcPtr+= srcStride[plane];
2253                     dstPtr+= dstStride[plane];
2254                 }
2255             } else if(is16BPS(c->srcFormat) && is16BPS(c->dstFormat)
2256                   && isBE(c->srcFormat) != isBE(c->dstFormat)) {
2257
2258                 for (i=0; i<height; i++) {
2259                     for (j=0; j<length; j++)
2260                         ((uint16_t*)dstPtr)[j] = bswap_16(((uint16_t*)srcPtr)[j]);
2261                     srcPtr+= srcStride[plane];
2262                     dstPtr+= dstStride[plane];
2263                 }
2264             } else if (dstStride[plane]==srcStride[plane] && srcStride[plane] > 0)
2265                 memcpy(dst[plane] + dstStride[plane]*y, src[plane], height*dstStride[plane]);
2266             else {
2267                 if(is16BPS(c->srcFormat) && is16BPS(c->dstFormat))
2268                     length*=2;
2269                 for (i=0; i<height; i++) {
2270                     memcpy(dstPtr, srcPtr, length);
2271                     srcPtr+= srcStride[plane];
2272                     dstPtr+= dstStride[plane];
2273                 }
2274             }
2275         }
2276     }
2277     return srcSliceH;
2278 }
2279
2280
2281 static void getSubSampleFactors(int *h, int *v, int format)
2282 {
2283     switch(format) {
2284     case PIX_FMT_UYVY422:
2285     case PIX_FMT_YUYV422:
2286         *h=1;
2287         *v=0;
2288         break;
2289     case PIX_FMT_YUV420P:
2290     case PIX_FMT_YUV420P16LE:
2291     case PIX_FMT_YUV420P16BE:
2292     case PIX_FMT_YUVA420P:
2293     case PIX_FMT_GRAY16BE:
2294     case PIX_FMT_GRAY16LE:
2295     case PIX_FMT_GRAY8: //FIXME remove after different subsamplings are fully implemented
2296     case PIX_FMT_NV12:
2297     case PIX_FMT_NV21:
2298         *h=1;
2299         *v=1;
2300         break;
2301     case PIX_FMT_YUV440P:
2302         *h=0;
2303         *v=1;
2304         break;
2305     case PIX_FMT_YUV410P:
2306         *h=2;
2307         *v=2;
2308         break;
2309     case PIX_FMT_YUV444P:
2310     case PIX_FMT_YUV444P16LE:
2311     case PIX_FMT_YUV444P16BE:
2312         *h=0;
2313         *v=0;
2314         break;
2315     case PIX_FMT_YUV422P:
2316     case PIX_FMT_YUV422P16LE:
2317     case PIX_FMT_YUV422P16BE:
2318         *h=1;
2319         *v=0;
2320         break;
2321     case PIX_FMT_YUV411P:
2322         *h=2;
2323         *v=0;
2324         break;
2325     default:
2326         *h=0;
2327         *v=0;
2328         break;
2329     }
2330 }
2331
/**
 * Converts a 16.16 fixed-point value to a rounded, saturated 16-bit integer.
 *
 * The value is rounded to nearest (half is added before the shift) and then
 * clamped: results below -0x7FFF yield 0x8000, results above 0x7FFF yield
 * 0x7FFF. Negative results are returned as their 16-bit two's-complement
 * bit pattern.
 *
 * The intermediate is kept in 64 bits so that inputs whose rounded value
 * does not fit in an int still saturate instead of being truncated first.
 *
 * @param f value in 16.16 fixed point
 * @return rounded and clamped value as a 16-bit pattern
 */
static uint16_t roundToInt16(int64_t f)
{
    const int64_t r = (f + (1<<15))>>16;

    if (r < -0x7FFF)
        return 0x8000;
    if (r >  0x7FFF)
        return 0x7FFF;
    return (uint16_t)r;
}
2339
2340 int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation)
2341 {
2342     int64_t crv =  inv_table[0];
2343     int64_t cbu =  inv_table[1];
2344     int64_t cgu = -inv_table[2];
2345     int64_t cgv = -inv_table[3];
2346     int64_t cy  = 1<<16;
2347     int64_t oy  = 0;
2348
2349     memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
2350     memcpy(c->dstColorspaceTable,     table, sizeof(int)*4);
2351
2352     c->brightness= brightness;
2353     c->contrast  = contrast;
2354     c->saturation= saturation;
2355     c->srcRange  = srcRange;
2356     c->dstRange  = dstRange;
2357     if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
2358
2359     c->uOffset=   0x0400040004000400LL;
2360     c->vOffset=   0x0400040004000400LL;
2361
2362     if (!srcRange) {
2363         cy= (cy*255) / 219;
2364         oy= 16<<16;
2365     } else {
2366         crv= (crv*224) / 255;
2367         cbu= (cbu*224) / 255;
2368         cgu= (cgu*224) / 255;
2369         cgv= (cgv*224) / 255;
2370     }
2371
2372     cy = (cy *contrast             )>>16;
2373     crv= (crv*contrast * saturation)>>32;
2374     cbu= (cbu*contrast * saturation)>>32;
2375     cgu= (cgu*contrast * saturation)>>32;
2376     cgv= (cgv*contrast * saturation)>>32;
2377
2378     oy -= 256*brightness;
2379
2380     c->yCoeff=    roundToInt16(cy *8192) * 0x0001000100010001ULL;
2381     c->vrCoeff=   roundToInt16(crv*8192) * 0x0001000100010001ULL;
2382     c->ubCoeff=   roundToInt16(cbu*8192) * 0x0001000100010001ULL;
2383     c->vgCoeff=   roundToInt16(cgv*8192) * 0x0001000100010001ULL;
2384     c->ugCoeff=   roundToInt16(cgu*8192) * 0x0001000100010001ULL;
2385     c->yOffset=   roundToInt16(oy *   8) * 0x0001000100010001ULL;
2386
2387     c->yuv2rgb_y_coeff  = (int16_t)roundToInt16(cy <<13);
2388     c->yuv2rgb_y_offset = (int16_t)roundToInt16(oy << 9);
2389     c->yuv2rgb_v2r_coeff= (int16_t)roundToInt16(crv<<13);
2390     c->yuv2rgb_v2g_coeff= (int16_t)roundToInt16(cgv<<13);
2391     c->yuv2rgb_u2g_coeff= (int16_t)roundToInt16(cgu<<13);
2392     c->yuv2rgb_u2b_coeff= (int16_t)roundToInt16(cbu<<13);
2393
2394     ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
2395     //FIXME factorize
2396
2397 #ifdef COMPILE_ALTIVEC
2398     if (c->flags & SWS_CPU_CAPS_ALTIVEC)
2399         ff_yuv2rgb_init_tables_altivec(c, inv_table, brightness, contrast, saturation);
2400 #endif
2401     return 0;
2402 }
2403
2404 int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation)
2405 {
2406     if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
2407
2408     *inv_table = c->srcColorspaceTable;
2409     *table     = c->dstColorspaceTable;
2410     *srcRange  = c->srcRange;
2411     *dstRange  = c->dstRange;
2412     *brightness= c->brightness;
2413     *contrast  = c->contrast;
2414     *saturation= c->saturation;
2415
2416     return 0;
2417 }
2418
2419 static int handle_jpeg(enum PixelFormat *format)
2420 {
2421     switch (*format) {
2422     case PIX_FMT_YUVJ420P:
2423         *format = PIX_FMT_YUV420P;
2424         return 1;
2425     case PIX_FMT_YUVJ422P:
2426         *format = PIX_FMT_YUV422P;
2427         return 1;
2428     case PIX_FMT_YUVJ444P:
2429         *format = PIX_FMT_YUV444P;
2430         return 1;
2431     case PIX_FMT_YUVJ440P:
2432         *format = PIX_FMT_YUV440P;
2433         return 1;
2434     default:
2435         return 0;
2436     }
2437 }
2438
/**
 * Allocate and set up a SwsContext converting a srcW x srcH image in
 * srcFormat into a dstW x dstH image in dstFormat.
 *
 * @param flags     must contain exactly one scaler algorithm bit (SWS_POINT,
 *                  SWS_AREA, SWS_BILINEAR, ...); may additionally carry CPU
 *                  capability bits, SWS_PRINT_INFO and other option bits
 * @param srcFilter optional source filter; NULL is replaced by an empty one
 * @param dstFilter optional destination filter; NULL is replaced by an empty one
 * @param param     optional array of two tuning values; when NULL both are
 *                  set to SWS_PARAM_DEFAULT
 * @return the new context, or NULL when a format is unsupported, the
 *         algorithm selection is invalid, the dimensions are rejected
 *         (srcW<4, srcH<1, dstW<8, dstH<1, or a width above VOFW), or a
 *         later setup step jumps to the fail label.
 *
 * When the conversion is unscaled, unfiltered and one of the special
 * converters matches, the context is returned early with c->swScale set and
 * none of the filter or line-buffer setup below is performed.
 */
2439 SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat, int dstW, int dstH, enum PixelFormat dstFormat, int flags,
2440                            SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param)
2441 {
2442
2443     SwsContext *c;
2444     int i;
2445     int usesVFilter, usesHFilter;
2446     int unscaled, needsDither;
2447     int srcRange, dstRange;
2448     SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
2449 #if ARCH_X86
2450     if (flags & SWS_CPU_CAPS_MMX)
2451         __asm__ volatile("emms\n\t"::: "memory");
2452 #endif
2453
2454 #if !CONFIG_RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off
2455     flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC|SWS_CPU_CAPS_BFIN);
2456 #if   COMPILE_TEMPLATE_MMX2
2457     flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
2458 #elif COMPILE_TEMPLATE_AMD3DNOW
2459     flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
2460 #elif COMPILE_TEMPLATE_MMX
2461     flags |= SWS_CPU_CAPS_MMX;
2462 #elif COMPILE_TEMPLATE_ALTIVEC
2463     flags |= SWS_CPU_CAPS_ALTIVEC;
2464 #elif ARCH_BFIN
2465     flags |= SWS_CPU_CAPS_BFIN;
2466 #endif
2467 #endif /* CONFIG_RUNTIME_CPUDETECT */
/* NOTE(review): these two checks look like lazy one-time initialisation of global tables - confirm against globalInit()/sws_rgb2rgb_init() */
2468     if (clip_table[512] != 255) globalInit();
2469     if (!rgb15to16) sws_rgb2rgb_init(flags);
2470
2471     unscaled = (srcW == dstW && srcH == dstH);
/* needsDither is derived from the caller's formats, before handle_jpeg() below may rewrite a YUVJ* format */
2472     needsDither= (isBGR(dstFormat) || isRGB(dstFormat))
2473         && (fmt_depth(dstFormat))<24
2474         && ((fmt_depth(dstFormat))<(fmt_depth(srcFormat)) || (!(isRGB(srcFormat) || isBGR(srcFormat))));
2475
/* handle_jpeg() replaces a YUVJ* format by its YUV* counterpart and returns 1 in that case (0 otherwise); that result becomes the range flag */
2476     srcRange = handle_jpeg(&srcFormat);
2477     dstRange = handle_jpeg(&dstFormat);
2478
2479     if (!isSupportedIn(srcFormat)) {
2480         av_log(NULL, AV_LOG_ERROR, "swScaler: %s is not supported as input pixel format\n", sws_format_name(srcFormat));
2481         return NULL;
2482     }
2483     if (!isSupportedOut(dstFormat)) {
2484         av_log(NULL, AV_LOG_ERROR, "swScaler: %s is not supported as output pixel format\n", sws_format_name(dstFormat));
2485         return NULL;
2486     }
2487
2488     i= flags & ( SWS_POINT
2489                 |SWS_AREA
2490                 |SWS_BILINEAR
2491                 |SWS_FAST_BILINEAR
2492                 |SWS_BICUBIC
2493                 |SWS_X
2494                 |SWS_GAUSS
2495                 |SWS_LANCZOS
2496                 |SWS_SINC
2497                 |SWS_SPLINE
2498                 |SWS_BICUBLIN);
/* i holds the selected algorithm bits; i & (i-1) is non-zero whenever more than one bit is set */
2499     if(!i || (i & (i-1))) {
2500         av_log(NULL, AV_LOG_ERROR, "swScaler: Exactly one scaler algorithm must be chosen\n");
2501         return NULL;
2502     }
2503
2504     /* sanity check */
2505     if (srcW<4 || srcH<1 || dstW<8 || dstH<1) { //FIXME check if these are enough and try to lower them after fixing the relevant parts of the code
2506         av_log(NULL, AV_LOG_ERROR, "swScaler: %dx%d -> %dx%d is invalid scaling dimension\n",
2507                srcW, srcH, dstW, dstH);
2508         return NULL;
2509     }
2510     if(srcW > VOFW || dstW > VOFW) {
2511         av_log(NULL, AV_LOG_ERROR, "swScaler: Compile-time maximum width is "AV_STRINGIFY(VOFW)" change VOF/VOFW and recompile\n");
2512         return NULL;
2513     }
2514
/* a NULL filter is treated like an all-NULL (empty) filter */
2515     if (!dstFilter) dstFilter= &dummyFilter;
2516     if (!srcFilter) srcFilter= &dummyFilter;
2517
2518     FF_ALLOCZ_OR_GOTO(NULL, c, sizeof(SwsContext), fail);
2519
2520     c->av_class = &sws_context_class;
2521     c->srcW= srcW;
2522     c->srcH= srcH;
2523     c->dstW= dstW;
2524     c->dstH= dstH;
2525     c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
2526     c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
2527     c->flags= flags;
2528     c->dstFormat= dstFormat;
2529     c->srcFormat= srcFormat;
2530     c->vRounder= 4* 0x0001000100010001ULL;
2531
/* a filter component only counts as "used" when its vector is longer than one coefficient */
2532     usesHFilter= usesVFilter= 0;
2533     if (dstFilter->lumV && dstFilter->lumV->length>1) usesVFilter=1;
2534     if (dstFilter->lumH && dstFilter->lumH->length>1) usesHFilter=1;
2535     if (dstFilter->chrV && dstFilter->chrV->length>1) usesVFilter=1;
2536     if (dstFilter->chrH && dstFilter->chrH->length>1) usesHFilter=1;
2537     if (srcFilter->lumV && srcFilter->lumV->length>1) usesVFilter=1;
2538     if (srcFilter->lumH && srcFilter->lumH->length>1) usesHFilter=1;
2539     if (srcFilter->chrV && srcFilter->chrV->length>1) usesVFilter=1;
2540     if (srcFilter->chrH && srcFilter->chrH->length>1) usesHFilter=1;
2541
2542     getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
2543     getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
2544
2545     // reuse chroma for 2 pixels RGB/BGR unless user wants full chroma interpolation
2546     if ((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
2547
2548     // drop some chroma lines if the user wants it
2549     c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT;
2550     c->chrSrcVSubSample+= c->vChrDrop;
2551
2552     // drop every other pixel for chroma calculation unless user wants full chroma
2553     if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)
2554       && srcFormat!=PIX_FMT_RGB8      && srcFormat!=PIX_FMT_BGR8
2555       && srcFormat!=PIX_FMT_RGB4      && srcFormat!=PIX_FMT_BGR4
2556       && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE
2557       && ((dstW>>c->chrDstHSubSample) <= (srcW>>1) || (flags&(SWS_FAST_BILINEAR|SWS_POINT))))
2558         c->chrSrcHSubSample=1;
2559
2560     if (param) {
2561         c->param[0] = param[0];
2562         c->param[1] = param[1];
2563     } else {
2564         c->param[0] =
2565         c->param[1] = SWS_PARAM_DEFAULT;
2566     }
2567
2568     // Note the -((-x)>>y) is so that we always round toward +inf.
2569     c->chrSrcW= -((-srcW) >> c->chrSrcHSubSample);
2570     c->chrSrcH= -((-srcH) >> c->chrSrcVSubSample);
2571     c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
2572     c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
2573
/* default tables, brightness 0, contrast and saturation 1<<16; the return value (-1 for YUV/gray destinations) is ignored here */
2574     sws_setColorspaceDetails(c, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], srcRange, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT] /* FIXME*/, dstRange, 0, 1<<16, 1<<16);
2575
2576     /* unscaled special cases */
2577     if (unscaled && !usesHFilter && !usesVFilter && (srcRange == dstRange || isBGR(dstFormat) || isRGB(dstFormat))) {
2578         /* yv12_to_nv12 */
2579         if ((srcFormat == PIX_FMT_YUV420P || srcFormat == PIX_FMT_YUVA420P) && (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)) {
2580             c->swScale= PlanarToNV12Wrapper;
2581         }
2582         /* yuv2bgr */
2583         if ((srcFormat==PIX_FMT_YUV420P || srcFormat==PIX_FMT_YUV422P || srcFormat==PIX_FMT_YUVA420P) && (isBGR(dstFormat) || isRGB(dstFormat))
2584             && !(flags & SWS_ACCURATE_RND) && !(dstH&1)) {
2585             c->swScale= ff_yuv2rgb_get_func_ptr(c);
2586         }
2587
2588         if (srcFormat==PIX_FMT_YUV410P && (dstFormat==PIX_FMT_YUV420P || dstFormat==PIX_FMT_YUVA420P) && !(flags & SWS_BITEXACT)) {
2589             c->swScale= yvu9toyv12Wrapper;
2590         }
2591
2592         /* bgr24toYV12 */
2593         if (srcFormat==PIX_FMT_BGR24 && (dstFormat==PIX_FMT_YUV420P || dstFormat==PIX_FMT_YUVA420P) && !(flags & SWS_ACCURATE_RND))
2594             c->swScale= bgr24toyv12Wrapper;
2595
2596         /* RGB/BGR -> RGB/BGR (no dither needed forms) */
2597         if (  (isBGR(srcFormat) || isRGB(srcFormat))
2598            && (isBGR(dstFormat) || isRGB(dstFormat))
2599            && srcFormat != PIX_FMT_BGR8      && dstFormat != PIX_FMT_BGR8
2600            && srcFormat != PIX_FMT_RGB8      && dstFormat != PIX_FMT_RGB8
2601            && srcFormat != PIX_FMT_BGR4      && dstFormat != PIX_FMT_BGR4
2602            && srcFormat != PIX_FMT_RGB4      && dstFormat != PIX_FMT_RGB4
2603            && srcFormat != PIX_FMT_BGR4_BYTE && dstFormat != PIX_FMT_BGR4_BYTE
2604            && srcFormat != PIX_FMT_RGB4_BYTE && dstFormat != PIX_FMT_RGB4_BYTE
2605            && srcFormat != PIX_FMT_MONOBLACK && dstFormat != PIX_FMT_MONOBLACK
2606            && srcFormat != PIX_FMT_MONOWHITE && dstFormat != PIX_FMT_MONOWHITE
2607                                              && dstFormat != PIX_FMT_RGB32_1
2608                                              && dstFormat != PIX_FMT_BGR32_1
2609            && srcFormat != PIX_FMT_RGB48LE   && dstFormat != PIX_FMT_RGB48LE
2610            && srcFormat != PIX_FMT_RGB48BE   && dstFormat != PIX_FMT_RGB48BE
2611            && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
2612              c->swScale= rgb2rgbWrapper;
2613
2614         if ((usePal(srcFormat) && (
2615                  dstFormat == PIX_FMT_RGB32   ||
2616                  dstFormat == PIX_FMT_RGB32_1 ||
2617                  dstFormat == PIX_FMT_RGB24   ||
2618                  dstFormat == PIX_FMT_BGR32   ||
2619                  dstFormat == PIX_FMT_BGR32_1 ||
2620                  dstFormat == PIX_FMT_BGR24)))
2621              c->swScale= pal2rgbWrapper;
2622
2623         if (srcFormat == PIX_FMT_YUV422P) {
2624             if (dstFormat == PIX_FMT_YUYV422)
2625                 c->swScale= YUV422PToYuy2Wrapper;
2626             else if (dstFormat == PIX_FMT_UYVY422)
2627                 c->swScale= YUV422PToUyvyWrapper;
2628         }
2629
2630         /* LQ converters if -sws 0 or -sws 4*/
2631         if (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)) {
2632             /* yv12_to_yuy2 */
2633             if (srcFormat == PIX_FMT_YUV420P || srcFormat == PIX_FMT_YUVA420P) {
2634                 if (dstFormat == PIX_FMT_YUYV422)
2635                     c->swScale= PlanarToYuy2Wrapper;
2636                 else if (dstFormat == PIX_FMT_UYVY422)
2637                     c->swScale= PlanarToUyvyWrapper;
2638             }
2639         }
2640         if(srcFormat == PIX_FMT_YUYV422 && (dstFormat == PIX_FMT_YUV420P || dstFormat == PIX_FMT_YUVA420P))
2641             c->swScale= YUYV2YUV420Wrapper;
2642         if(srcFormat == PIX_FMT_UYVY422 && (dstFormat == PIX_FMT_YUV420P || dstFormat == PIX_FMT_YUVA420P))
2643             c->swScale= UYVY2YUV420Wrapper;
2644         if(srcFormat == PIX_FMT_YUYV422 && dstFormat == PIX_FMT_YUV422P)
2645             c->swScale= YUYV2YUV422Wrapper;
2646         if(srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
2647             c->swScale= UYVY2YUV422Wrapper;
2648
2649 #ifdef COMPILE_ALTIVEC
2650         if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
2651             !(c->flags & SWS_BITEXACT) &&
2652             srcFormat == PIX_FMT_YUV420P) {
2653           // unscaled YV12 -> packed YUV, we want speed
2654           if (dstFormat == PIX_FMT_YUYV422)
2655               c->swScale= yv12toyuy2_unscaled_altivec;
2656           else if (dstFormat == PIX_FMT_UYVY422)
2657               c->swScale= yv12touyvy_unscaled_altivec;
2658         }
2659 #endif
2660
2661         /* simple copy */
2662         if (  srcFormat == dstFormat
2663             || (srcFormat == PIX_FMT_YUVA420P && dstFormat == PIX_FMT_YUV420P)
2664             || (srcFormat == PIX_FMT_YUV420P && dstFormat == PIX_FMT_YUVA420P)
2665             || (isPlanarYUV(srcFormat) && isGray(dstFormat))
2666             || (isPlanarYUV(dstFormat) && isGray(srcFormat))
2667             || (isGray(dstFormat) && isGray(srcFormat))
2668             || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)
2669                 && c->chrDstHSubSample == c->chrSrcHSubSample
2670                 && c->chrDstVSubSample == c->chrSrcVSubSample
2671                 && dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21
2672                 && srcFormat != PIX_FMT_NV12 && srcFormat != PIX_FMT_NV21))
2673         {
2674             if (isPacked(c->srcFormat))
2675                 c->swScale= packedCopy;
2676             else /* Planar YUV or gray */
2677                 c->swScale= planarCopy;
2678         }
2679 #if ARCH_BFIN
2680         if (flags & SWS_CPU_CAPS_BFIN)
2681             ff_bfin_get_unscaled_swscale (c);
2682 #endif
2683
/* a special converter was selected above: return now and skip all scaler setup */
2684         if (c->swScale) {
2685             if (flags&SWS_PRINT_INFO)
2686                 av_log(c, AV_LOG_INFO, "using unscaled %s -> %s special converter\n",
2687                        sws_format_name(srcFormat), sws_format_name(dstFormat));
2688             return c;
2689         }
2690     }
2691
2692     if (flags & SWS_CPU_CAPS_MMX2) {
2693         c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2694         if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) {
2695             if (flags&SWS_PRINT_INFO)
2696                 av_log(c, AV_LOG_INFO, "output width is not a multiple of 32 -> no MMX2 scaler\n");
2697         }
2698         if (usesHFilter) c->canMMX2BeUsed=0;
2699     }
2700     else
2701         c->canMMX2BeUsed=0;
2702
2703     c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
2704     c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
2705
2706     // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2707     // but only for the FAST_BILINEAR mode otherwise do correct scaling
2708     // n-2 is the last chrominance sample available
2709     // this is not perfect, but no one should notice the difference, the more correct variant
2710     // would be like the vertical one, but that would require some special code for the
2711     // first and last pixel
2712     if (flags&SWS_FAST_BILINEAR) {
2713         if (c->canMMX2BeUsed) {
2714             c->lumXInc+= 20;
2715             c->chrXInc+= 20;
2716         }
2717         //we don't use the x86 asm scaler if MMX is available
2718         else if (flags & SWS_CPU_CAPS_MMX) {
2719             c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
/* NOTE(review): c->chrDstW-2 looks like it can be 0 (e.g. dstW==8 with a horizontal chroma shift of 2) - confirm whether this path is reachable then */
2720             c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
2721         }
2722     }
2723
2724     /* precalculate horizontal scaler filter coefficients */
2725     {
2726         const int filterAlign=
2727             (flags & SWS_CPU_CAPS_MMX) ? 4 :
2728             (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
2729             1;
2730
2731         if (initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
2732                        srcW      ,       dstW, filterAlign, 1<<14,
2733                        (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
2734                        srcFilter->lumH, dstFilter->lumH, c->param) < 0)
2735             goto fail;
2736         if (initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
2737                        c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
2738                        (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
2739                        srcFilter->chrH, dstFilter->chrH, c->param) < 0)
2740             goto fail;
2741
2742 #if defined(COMPILE_MMX2)
2743 // can't downscale !!!
2744         if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) {
/* first pass with NULL buffers only returns the required code size; the second pass further down fills the allocated buffers */
2745             c->lumMmx2FilterCodeSize = initMMX2HScaler(      dstW, c->lumXInc, NULL, NULL, NULL, 8);
2746             c->chrMmx2FilterCodeSize = initMMX2HScaler(c->chrDstW, c->chrXInc, NULL, NULL, NULL, 4);
2747
/* NOTE(review): none of the three allocation variants below has its result checked before the buffers are written to */
2748 #ifdef MAP_ANONYMOUS
2749             c->lumMmx2FilterCode = mmap(NULL, c->lumMmx2FilterCodeSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
2750             c->chrMmx2FilterCode = mmap(NULL, c->chrMmx2FilterCodeSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
2751 #elif HAVE_VIRTUALALLOC
2752             c->lumMmx2FilterCode = VirtualAlloc(NULL, c->lumMmx2FilterCodeSize, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
2753             c->chrMmx2FilterCode = VirtualAlloc(NULL, c->chrMmx2FilterCodeSize, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
2754 #else
2755             c->lumMmx2FilterCode = av_malloc(c->lumMmx2FilterCodeSize);
2756             c->chrMmx2FilterCode = av_malloc(c->chrMmx2FilterCodeSize);
2757 #endif
2758
2759             FF_ALLOCZ_OR_GOTO(c, c->lumMmx2Filter   , (dstW        /8+8)*sizeof(int16_t), fail);
2760             FF_ALLOCZ_OR_GOTO(c, c->chrMmx2Filter   , (c->chrDstW  /4+8)*sizeof(int16_t), fail);
2761             FF_ALLOCZ_OR_GOTO(c, c->lumMmx2FilterPos, (dstW      /2/8+8)*sizeof(int32_t), fail);
2762             FF_ALLOCZ_OR_GOTO(c, c->chrMmx2FilterPos, (c->chrDstW/2/4+8)*sizeof(int32_t), fail);
2763
2764             initMMX2HScaler(      dstW, c->lumXInc, c->lumMmx2FilterCode, c->lumMmx2Filter, c->lumMmx2FilterPos, 8);
2765             initMMX2HScaler(c->chrDstW, c->chrXInc, c->chrMmx2FilterCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
2766
/* once the code has been generated, the mmap'ed regions are switched from writable to executable */
2767 #ifdef MAP_ANONYMOUS
2768             mprotect(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
2769             mprotect(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
2770 #endif
2771         }
2772 #endif /* defined(COMPILE_MMX2) */
2773     } // initialize horizontal stuff
2774
2775
2776
2777     /* precalculate vertical scaler filter coefficients */
2778     {
2779         const int filterAlign=
2780             (flags & SWS_CPU_CAPS_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
2781             (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
2782             1;
2783
2784         if (initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
2785                        srcH      ,        dstH, filterAlign, (1<<12),
2786                        (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
2787                        srcFilter->lumV, dstFilter->lumV, c->param) < 0)
2788             goto fail;
2789         if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
2790                        c->chrSrcH, c->chrDstH, filterAlign, (1<<12),
2791                        (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
2792                        srcFilter->chrV, dstFilter->chrV, c->param) < 0)
2793             goto fail;
2794
2795 #ifdef COMPILE_ALTIVEC
2796         FF_ALLOC_OR_GOTO(c, c->vYCoeffsBank, sizeof (vector signed short)*c->vLumFilterSize*c->dstH, fail);
2797         FF_ALLOC_OR_GOTO(c, c->vCCoeffsBank, sizeof (vector signed short)*c->vChrFilterSize*c->chrDstH, fail);
2798
/* each vertical coefficient is copied into all 8 short lanes of its vector */
2799         for (i=0;i<c->vLumFilterSize*c->dstH;i++) {
2800             int j;
2801             short *p = (short *)&c->vYCoeffsBank[i];
2802             for (j=0;j<8;j++)
2803                 p[j] = c->vLumFilter[i];
2804         }
2805
2806         for (i=0;i<c->vChrFilterSize*c->chrDstH;i++) {
2807             int j;
2808             short *p = (short *)&c->vCCoeffsBank[i];
2809             for (j=0;j<8;j++)
2810                 p[j] = c->vChrFilter[i];
2811         }
2812 #endif
2813     }
2814
2815     // calculate buffer sizes so that they won't run out while handling these damn slices
2816     c->vLumBufSize= c->vLumFilterSize;
2817     c->vChrBufSize= c->vChrFilterSize;
2818     for (i=0; i<dstH; i++) {
2819         int chrI= i*c->chrDstH / dstH;
2820         int nextSlice= FFMAX(c->vLumFilterPos[i   ] + c->vLumFilterSize - 1,
2821                            ((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)<<c->chrSrcVSubSample));
2822
/* round nextSlice down to a multiple of 1<<chrSrcVSubSample */
2823         nextSlice>>= c->chrSrcVSubSample;
2824         nextSlice<<= c->chrSrcVSubSample;
2825         if (c->vLumFilterPos[i   ] + c->vLumBufSize < nextSlice)
2826             c->vLumBufSize= nextSlice - c->vLumFilterPos[i];
2827         if (c->vChrFilterPos[chrI] + c->vChrBufSize < (nextSlice>>c->chrSrcVSubSample))
2828             c->vChrBufSize= (nextSlice>>c->chrSrcVSubSample) - c->vChrFilterPos[chrI];
2829     }
2830
2831     // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2832     // allocate several megabytes to handle all possible cases)
2833     FF_ALLOC_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail);
2834     FF_ALLOC_OR_GOTO(c, c->chrPixBuf, c->vChrBufSize*2*sizeof(int16_t*), fail);
2835     if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat) && isALPHA(c->dstFormat))
2836         FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf, c->vLumBufSize*2*sizeof(int16_t*), fail);
2837     //Note we need at least one pixel more at the end because of the MMX code (just in case someone wanna replace the 4000/8000)
2838     /* align at 16 bytes for AltiVec */
/* the pointer arrays have 2*bufSize entries: every line buffer is stored twice, at index i and at i+bufSize */
2839     for (i=0; i<c->vLumBufSize; i++) {
2840         FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i+c->vLumBufSize], VOF+1, fail);
2841         c->lumPixBuf[i] = c->lumPixBuf[i+c->vLumBufSize];
2842     }
2843     for (i=0; i<c->vChrBufSize; i++) {
2844         FF_ALLOC_OR_GOTO(c, c->chrPixBuf[i+c->vChrBufSize], (VOF+1)*2, fail);
2845         c->chrPixBuf[i] = c->chrPixBuf[i+c->vChrBufSize];
2846     }
2847     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
2848         for (i=0; i<c->vLumBufSize; i++) {
2849             FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf[i+c->vLumBufSize], VOF+1, fail);
2850             c->alpPixBuf[i] = c->alpPixBuf[i+c->vLumBufSize];
2851         }
2852
2853     //try to avoid drawing green stuff between the right end and the stride end
2854     for (i=0; i<c->vChrBufSize; i++) memset(c->chrPixBuf[i], 64, (VOF+1)*2);
2855
2856     assert(2*VOFW == VOF);
2857
2858     assert(c->chrDstH <= dstH);
2859
2860     if (flags&SWS_PRINT_INFO) {
2861 #ifdef DITHER1XBPP
2862         const char *dither= " dithered";
2863 #else
2864         const char *dither= "";
2865 #endif
2866         if (flags&SWS_FAST_BILINEAR)
2867             av_log(c, AV_LOG_INFO, "FAST_BILINEAR scaler, ");
2868         else if (flags&SWS_BILINEAR)
2869             av_log(c, AV_LOG_INFO, "BILINEAR scaler, ");
2870         else if (flags&SWS_BICUBIC)
2871             av_log(c, AV_LOG_INFO, "BICUBIC scaler, ");
2872         else if (flags&SWS_X)
2873             av_log(c, AV_LOG_INFO, "Experimental scaler, ");
2874         else if (flags&SWS_POINT)
2875             av_log(c, AV_LOG_INFO, "Nearest Neighbor / POINT scaler, ");
2876         else if (flags&SWS_AREA)
2877             av_log(c, AV_LOG_INFO, "Area Averageing scaler, ");
2878         else if (flags&SWS_BICUBLIN)
2879             av_log(c, AV_LOG_INFO, "luma BICUBIC / chroma BILINEAR scaler, ");
2880         else if (flags&SWS_GAUSS)
2881             av_log(c, AV_LOG_INFO, "Gaussian scaler, ");
2882         else if (flags&SWS_SINC)
2883             av_log(c, AV_LOG_INFO, "Sinc scaler, ");
2884         else if (flags&SWS_LANCZOS)
2885             av_log(c, AV_LOG_INFO, "Lanczos scaler, ");
2886         else if (flags&SWS_SPLINE)
2887             av_log(c, AV_LOG_INFO, "Bicubic spline scaler, ");
2888         else
2889             av_log(c, AV_LOG_INFO, "ehh flags invalid?! ");
2890
2891         if (dstFormat==PIX_FMT_BGR555 || dstFormat==PIX_FMT_BGR565)
2892             av_log(c, AV_LOG_INFO, "from %s to%s %s ",
2893                    sws_format_name(srcFormat), dither, sws_format_name(dstFormat));
2894         else
2895             av_log(c, AV_LOG_INFO, "from %s to %s ",
2896                    sws_format_name(srcFormat), sws_format_name(dstFormat));
2897
2898         if (flags & SWS_CPU_CAPS_MMX2)
2899             av_log(c, AV_LOG_INFO, "using MMX2\n");
2900         else if (flags & SWS_CPU_CAPS_3DNOW)
2901             av_log(c, AV_LOG_INFO, "using 3DNOW\n");
2902         else if (flags & SWS_CPU_CAPS_MMX)
2903             av_log(c, AV_LOG_INFO, "using MMX\n");
2904         else if (flags & SWS_CPU_CAPS_ALTIVEC)
2905             av_log(c, AV_LOG_INFO, "using AltiVec\n");
2906         else
2907             av_log(c, AV_LOG_INFO, "using C\n");
2908     }
2909
2910     if (flags & SWS_PRINT_INFO) {
2911         if (flags & SWS_CPU_CAPS_MMX) {
2912             if (c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
2913                 av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2914             else {
2915                 if (c->hLumFilterSize==4)
2916                     av_log(c, AV_LOG_VERBOSE, "using 4-tap MMX scaler for horizontal luminance scaling\n");
2917                 else if (c->hLumFilterSize==8)
2918                     av_log(c, AV_LOG_VERBOSE, "using 8-tap MMX scaler for horizontal luminance scaling\n");
2919                 else
2920                     av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal luminance scaling\n");
2921
2922                 if (c->hChrFilterSize==4)
2923                     av_log(c, AV_LOG_VERBOSE, "using 4-tap MMX scaler for horizontal chrominance scaling\n");
2924                 else if (c->hChrFilterSize==8)
2925                     av_log(c, AV_LOG_VERBOSE, "using 8-tap MMX scaler for horizontal chrominance scaling\n");
2926                 else
2927                     av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal chrominance scaling\n");
2928             }
2929         } else {
2930 #if ARCH_X86
2931             av_log(c, AV_LOG_VERBOSE, "using x86 asm scaler for horizontal scaling\n");
2932 #else
2933             if (flags & SWS_FAST_BILINEAR)
2934                 av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR C scaler for horizontal scaling\n");
2935             else
2936                 av_log(c, AV_LOG_VERBOSE, "using C scaler for horizontal scaling\n");
2937 #endif
2938         }
2939         if (isPlanarYUV(dstFormat)) {
2940             if (c->vLumFilterSize==1)
2941                 av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2942             else
2943                 av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2944         } else {
2945             if (c->vLumFilterSize==1 && c->vChrFilterSize==2)
2946                 av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2947                        "      2-tap scaler for vertical chrominance scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2948             else if (c->vLumFilterSize==2 && c->vChrFilterSize==2)
2949                 av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2950             else
2951                 av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2952         }
2953
2954         if (dstFormat==PIX_FMT_BGR24)
2955             av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 converter\n",
2956                    (flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"));
2957         else if (dstFormat==PIX_FMT_RGB32)
2958             av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2959         else if (dstFormat==PIX_FMT_BGR565)
2960             av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2961         else if (dstFormat==PIX_FMT_BGR555)
2962             av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
2963
2964         av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2965     }
2966     if (flags & SWS_PRINT_INFO) {
2967         av_log(c, AV_LOG_DEBUG, "lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2968                c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
2969         av_log(c, AV_LOG_DEBUG, "chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
2970                c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
2971     }
2972
/* generic (scaling) path: the scaling function is picked by getSwsFunc() */
2973     c->swScale= getSwsFunc(c);
2974     return c;
2975
/* common error exit; presumably sws_freeContext() releases whatever was allocated so far - TODO confirm it accepts a NULL or partially initialised context */
2976 fail:
2977     sws_freeContext(c);
2978     return NULL;
2979 }
2980
2981 static void reset_ptr(uint8_t* src[], int format)
2982 {
2983     if(!isALPHA(format))
2984         src[3]=NULL;
2985     if(!isPlanarYUV(format)) {
2986         src[3]=src[2]=NULL;
2987         if(   format != PIX_FMT_PAL8
2988            && format != PIX_FMT_RGB8
2989            && format != PIX_FMT_BGR8
2990            && format != PIX_FMT_RGB4_BYTE
2991            && format != PIX_FMT_BGR4_BYTE
2992           )
2993             src[1]= NULL;
2994     }
2995 }
2996
2997 /**
2998  * swscale wrapper, so we don't need to export the SwsContext.
2999  * Assumes planar YUV to be in YUV order instead of YVU.
3000  */
3001 int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
3002               int srcSliceH, uint8_t* dst[], int dstStride[])
3003 {
3004     int i;
3005     uint8_t* src2[4]= {src[0], src[1], src[2], src[3]};
3006     uint8_t* dst2[4]= {dst[0], dst[1], dst[2], dst[3]};
3007
3008     if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
3009         av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
3010         return 0;
3011     }
3012     if (c->sliceDir == 0) {
3013         if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
3014     }
3015
3016     if (usePal(c->srcFormat)) {
3017         for (i=0; i<256; i++) {
3018             int p, r, g, b,y,u,v;
3019             if(c->srcFormat == PIX_FMT_PAL8) {
3020                 p=((uint32_t*)(src[1]))[i];
3021                 r= (p>>16)&0xFF;
3022                 g= (p>> 8)&0xFF;
3023                 b=  p     &0xFF;
3024             } else if(c->srcFormat == PIX_FMT_RGB8) {
3025                 r= (i>>5    )*36;
3026                 g= ((i>>2)&7)*36;
3027                 b= (i&3     )*85;
3028             } else if(c->srcFormat == PIX_FMT_BGR8) {
3029                 b= (i>>6    )*85;
3030                 g= ((i>>3)&7)*36;
3031                 r= (i&7     )*36;
3032             } else if(c->srcFormat == PIX_FMT_RGB4_BYTE) {
3033                 r= (i>>3    )*255;
3034                 g= ((i>>1)&3)*85;
3035                 b= (i&1     )*255;
3036             } else {
3037                 assert(c->srcFormat == PIX_FMT_BGR4_BYTE);
3038                 b= (i>>3    )*255;
3039                 g= ((i>>1)&3)*85;
3040                 r= (i&1     )*255;
3041             }
3042             y= av_clip_uint8((RY*r + GY*g + BY*b + ( 33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
3043             u= av_clip_uint8((RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
3044             v= av_clip_uint8((RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
3045             c->pal_yuv[i]= y + (u<<8) + (v<<16);
3046
3047
3048             switch(c->dstFormat) {
3049             case PIX_FMT_BGR32:
3050 #if !HAVE_BIGENDIAN
3051             case PIX_FMT_RGB24:
3052 #endif
3053                 c->pal_rgb[i]=  r + (g<<8) + (b<<16);
3054                 break;
3055             case PIX_FMT_BGR32_1:
3056 #if HAVE_BIGENDIAN
3057             case PIX_FMT_BGR24:
3058 #endif
3059                 c->pal_rgb[i]= (r + (g<<8) + (b<<16)) << 8;
3060                 break;
3061             case PIX_FMT_RGB32_1:
3062 #if HAVE_BIGENDIAN
3063             case PIX_FMT_RGB24:
3064 #endif
3065                 c->pal_rgb[i]= (b + (g<<8) + (r<<16)) << 8;
3066                 break;
3067             case PIX_FMT_RGB32:
3068 #if !HAVE_BIGENDIAN
3069             case PIX_FMT_BGR24:
3070 #endif
3071             default:
3072                 c->pal_rgb[i]=  b + (g<<8) + (r<<16);
3073             }
3074         }
3075     }
3076
3077     // copy strides, so they can safely be modified
3078     if (c->sliceDir == 1) {
3079         // slices go from top to bottom
3080         int srcStride2[4]= {srcStride[0], srcStride[1], srcStride[2], srcStride[3]};
3081         int dstStride2[4]= {dstStride[0], dstStride[1], dstStride[2], dstStride[3]};
3082
3083         reset_ptr(src2, c->srcFormat);
3084         reset_ptr(dst2, c->dstFormat);
3085
3086         /* reset slice direction at end of frame */
3087         if (srcSliceY + srcSliceH == c->srcH)
3088             c->sliceDir = 0;
3089
3090         return c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2, dstStride2);
3091     } else {
3092         // slices go from bottom to top => we flip the image internally
3093         int srcStride2[4]= {-srcStride[0], -srcStride[1], -srcStride[2], -srcStride[3]};
3094         int dstStride2[4]= {-dstStride[0], -dstStride[1], -dstStride[2], -dstStride[3]};
3095
3096         src2[0] += (srcSliceH-1)*srcStride[0];
3097         if (!usePal(c->srcFormat))
3098             src2[1] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[1];
3099         src2[2] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[2];
3100         src2[3] += (srcSliceH-1)*srcStride[3];
3101         dst2[0] += ( c->dstH                      -1)*dstStride[0];
3102         dst2[1] += ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[1];
3103         dst2[2] += ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[2];
3104         dst2[3] += ( c->dstH                      -1)*dstStride[3];
3105
3106         reset_ptr(src2, c->srcFormat);
3107         reset_ptr(dst2, c->dstFormat);
3108
3109         /* reset slice direction at end of frame */
3110         if (!srcSliceY)
3111             c->sliceDir = 0;
3112
3113         return c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH, srcSliceH, dst2, dstStride2);
3114     }
3115 }
3116
3117 #if LIBSWSCALE_VERSION_MAJOR < 1
3118 int sws_scale_ordered(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
3119                       int srcSliceH, uint8_t* dst[], int dstStride[])
3120 {
3121     return sws_scale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
3122 }
3123 #endif
3124
3125 SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
3126                                 float lumaSharpen, float chromaSharpen,
3127                                 float chromaHShift, float chromaVShift,
3128                                 int verbose)
3129 {
3130     SwsFilter *filter= av_malloc(sizeof(SwsFilter));
3131     if (!filter)
3132         return NULL;
3133
3134     if (lumaGBlur!=0.0) {
3135         filter->lumH= sws_getGaussianVec(lumaGBlur, 3.0);
3136         filter->lumV= sws_getGaussianVec(lumaGBlur, 3.0);
3137     } else {
3138         filter->lumH= sws_getIdentityVec();
3139         filter->lumV= sws_getIdentityVec();
3140     }
3141
3142     if (chromaGBlur!=0.0) {
3143         filter->chrH= sws_getGaussianVec(chromaGBlur, 3.0);
3144         filter->chrV= sws_getGaussianVec(chromaGBlur, 3.0);
3145     } else {
3146         filter->chrH= sws_getIdentityVec();
3147         filter->chrV= sws_getIdentityVec();
3148     }
3149
3150     if (chromaSharpen!=0.0) {
3151         SwsVector *id= sws_getIdentityVec();
3152         sws_scaleVec(filter->chrH, -chromaSharpen);
3153         sws_scaleVec(filter->chrV, -chromaSharpen);
3154         sws_addVec(filter->chrH, id);
3155         sws_addVec(filter->chrV, id);
3156         sws_freeVec(id);
3157     }
3158
3159     if (lumaSharpen!=0.0) {
3160         SwsVector *id= sws_getIdentityVec();
3161         sws_scaleVec(filter->lumH, -lumaSharpen);
3162         sws_scaleVec(filter->lumV, -lumaSharpen);
3163         sws_addVec(filter->lumH, id);
3164         sws_addVec(filter->lumV, id);
3165         sws_freeVec(id);
3166     }
3167
3168     if (chromaHShift != 0.0)
3169         sws_shiftVec(filter->chrH, (int)(chromaHShift+0.5));
3170
3171     if (chromaVShift != 0.0)
3172         sws_shiftVec(filter->chrV, (int)(chromaVShift+0.5));
3173
3174     sws_normalizeVec(filter->chrH, 1.0);
3175     sws_normalizeVec(filter->chrV, 1.0);
3176     sws_normalizeVec(filter->lumH, 1.0);
3177     sws_normalizeVec(filter->lumV, 1.0);
3178
3179     if (verbose) sws_printVec2(filter->chrH, NULL, AV_LOG_DEBUG);
3180     if (verbose) sws_printVec2(filter->lumH, NULL, AV_LOG_DEBUG);
3181
3182     return filter;
3183 }
3184
3185 SwsVector *sws_allocVec(int length)
3186 {
3187     SwsVector *vec = av_malloc(sizeof(SwsVector));
3188     if (!vec)
3189         return NULL;
3190     vec->length = length;
3191     vec->coeff  = av_malloc(sizeof(double) * length);
3192     if (!vec->coeff)
3193         av_freep(&vec);
3194     return vec;
3195 }
3196
3197 SwsVector *sws_getGaussianVec(double variance, double quality)
3198 {
3199     const int length= (int)(variance*quality + 0.5) | 1;
3200     int i;
3201     double middle= (length-1)*0.5;
3202     SwsVector *vec= sws_allocVec(length);
3203
3204     if (!vec)
3205         return NULL;
3206
3207     for (i=0; i<length; i++) {
3208         double dist= i-middle;
3209         vec->coeff[i]= exp(-dist*dist/(2*variance*variance)) / sqrt(2*variance*PI);
3210     }
3211
3212     sws_normalizeVec(vec, 1.0);
3213
3214     return vec;
3215 }
3216
3217 SwsVector *sws_getConstVec(double c, int length)
3218 {
3219     int i;
3220     SwsVector *vec= sws_allocVec(length);
3221
3222     if (!vec)
3223         return NULL;
3224
3225     for (i=0; i<length; i++)
3226         vec->coeff[i]= c;
3227
3228     return vec;
3229 }
3230
3231
3232 SwsVector *sws_getIdentityVec(void)
3233 {
3234     return sws_getConstVec(1.0, 1);
3235 }
3236
3237 double sws_dcVec(SwsVector *a)
3238 {
3239     int i;
3240     double sum=0;
3241
3242     for (i=0; i<a->length; i++)
3243         sum+= a->coeff[i];
3244
3245     return sum;
3246 }
3247
3248 void sws_scaleVec(SwsVector *a, double scalar)
3249 {
3250     int i;
3251
3252     for (i=0; i<a->length; i++)
3253         a->coeff[i]*= scalar;
3254 }
3255
3256 void sws_normalizeVec(SwsVector *a, double height)
3257 {
3258     sws_scaleVec(a, height/sws_dcVec(a));
3259 }
3260
3261 static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b)
3262 {
3263     int length= a->length + b->length - 1;
3264     int i, j;
3265     SwsVector *vec= sws_getConstVec(0.0, length);
3266
3267     if (!vec)
3268         return NULL;
3269
3270     for (i=0; i<a->length; i++) {
3271         for (j=0; j<b->length; j++) {
3272             vec->coeff[i+j]+= a->coeff[i]*b->coeff[j];
3273         }
3274     }
3275
3276     return vec;
3277 }
3278
3279 static SwsVector *sws_sumVec(SwsVector *a, SwsVector *b)
3280 {
3281     int length= FFMAX(a->length, b->length);
3282     int i;
3283     SwsVector *vec= sws_getConstVec(0.0, length);
3284
3285     if (!vec)
3286         return NULL;
3287
3288     for (i=0; i<a->length; i++) vec->coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
3289     for (i=0; i<b->length; i++) vec->coeff[i + (length-1)/2 - (b->length-1)/2]+= b->coeff[i];
3290
3291     return vec;
3292 }
3293
3294 static SwsVector *sws_diffVec(SwsVector *a, SwsVector *b)
3295 {
3296     int length= FFMAX(a->length, b->length);
3297     int i;
3298     SwsVector *vec= sws_getConstVec(0.0, length);
3299
3300     if (!vec)
3301         return NULL;
3302
3303     for (i=0; i<a->length; i++) vec->coeff[i + (length-1)/2 - (a->length-1)/2]+= a->coeff[i];
3304     for (i=0; i<b->length; i++) vec->coeff[i + (length-1)/2 - (b->length-1)/2]-= b->coeff[i];
3305
3306     return vec;
3307 }
3308
3309 /* shift left / or right if "shift" is negative */
3310 static SwsVector *sws_getShiftedVec(SwsVector *a, int shift)
3311 {
3312     int length= a->length + FFABS(shift)*2;
3313     int i;
3314     SwsVector *vec= sws_getConstVec(0.0, length);
3315
3316     if (!vec)
3317         return NULL;
3318
3319     for (i=0; i<a->length; i++) {
3320         vec->coeff[i + (length-1)/2 - (a->length-1)/2 - shift]= a->coeff[i];
3321     }
3322
3323     return vec;
3324 }
3325
3326 void sws_shiftVec(SwsVector *a, int shift)
3327 {
3328     SwsVector *shifted= sws_getShiftedVec(a, shift);
3329     av_free(a->coeff);
3330     a->coeff= shifted->coeff;
3331     a->length= shifted->length;
3332     av_free(shifted);
3333 }
3334
3335 void sws_addVec(SwsVector *a, SwsVector *b)
3336 {
3337     SwsVector *sum= sws_sumVec(a, b);
3338     av_free(a->coeff);
3339     a->coeff= sum->coeff;
3340     a->length= sum->length;
3341     av_free(sum);
3342 }
3343
3344 void sws_subVec(SwsVector *a, SwsVector *b)
3345 {
3346     SwsVector *diff= sws_diffVec(a, b);
3347     av_free(a->coeff);
3348     a->coeff= diff->coeff;
3349     a->length= diff->length;
3350     av_free(diff);
3351 }
3352
3353 void sws_convVec(SwsVector *a, SwsVector *b)
3354 {
3355     SwsVector *conv= sws_getConvVec(a, b);
3356     av_free(a->coeff);
3357     a->coeff= conv->coeff;
3358     a->length= conv->length;
3359     av_free(conv);
3360 }
3361
3362 SwsVector *sws_cloneVec(SwsVector *a)
3363 {
3364     int i;
3365     SwsVector *vec= sws_allocVec(a->length);
3366
3367     if (!vec)
3368         return NULL;
3369
3370     for (i=0; i<a->length; i++) vec->coeff[i]= a->coeff[i];
3371
3372     return vec;
3373 }
3374
3375 void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level)
3376 {
3377     int i;
3378     double max=0;
3379     double min=0;
3380     double range;
3381
3382     for (i=0; i<a->length; i++)
3383         if (a->coeff[i]>max) max= a->coeff[i];
3384
3385     for (i=0; i<a->length; i++)
3386         if (a->coeff[i]<min) min= a->coeff[i];
3387
3388     range= max - min;
3389
3390     for (i=0; i<a->length; i++) {
3391         int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
3392         av_log(log_ctx, log_level, "%1.3f ", a->coeff[i]);
3393         for (;x>0; x--) av_log(log_ctx, log_level, " ");
3394         av_log(log_ctx, log_level, "|\n");
3395     }
3396 }
3397
3398 #if LIBSWSCALE_VERSION_MAJOR < 1
3399 void sws_printVec(SwsVector *a)
3400 {
3401     sws_printVec2(a, NULL, AV_LOG_DEBUG);
3402 }
3403 #endif
3404
3405 void sws_freeVec(SwsVector *a)
3406 {
3407     if (!a) return;
3408     av_freep(&a->coeff);
3409     a->length=0;
3410     av_free(a);
3411 }
3412
3413 void sws_freeFilter(SwsFilter *filter)
3414 {
3415     if (!filter) return;
3416
3417     if (filter->lumH) sws_freeVec(filter->lumH);
3418     if (filter->lumV) sws_freeVec(filter->lumV);
3419     if (filter->chrH) sws_freeVec(filter->chrH);
3420     if (filter->chrV) sws_freeVec(filter->chrV);
3421     av_free(filter);
3422 }
3423
3424
3425 void sws_freeContext(SwsContext *c)
3426 {
3427     int i;
3428     if (!c) return;
3429
3430     if (c->lumPixBuf) {
3431         for (i=0; i<c->vLumBufSize; i++)
3432             av_freep(&c->lumPixBuf[i]);
3433         av_freep(&c->lumPixBuf);
3434     }
3435
3436     if (c->chrPixBuf) {
3437         for (i=0; i<c->vChrBufSize; i++)
3438             av_freep(&c->chrPixBuf[i]);
3439         av_freep(&c->chrPixBuf);
3440     }
3441
3442     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
3443         for (i=0; i<c->vLumBufSize; i++)
3444             av_freep(&c->alpPixBuf[i]);
3445         av_freep(&c->alpPixBuf);
3446     }
3447
3448     av_freep(&c->vLumFilter);
3449     av_freep(&c->vChrFilter);
3450     av_freep(&c->hLumFilter);
3451     av_freep(&c->hChrFilter);
3452 #ifdef COMPILE_ALTIVEC
3453     av_freep(&c->vYCoeffsBank);
3454     av_freep(&c->vCCoeffsBank);
3455 #endif
3456
3457     av_freep(&c->vLumFilterPos);
3458     av_freep(&c->vChrFilterPos);
3459     av_freep(&c->hLumFilterPos);
3460     av_freep(&c->hChrFilterPos);
3461
3462 #if ARCH_X86 && CONFIG_GPL
3463 #ifdef MAP_ANONYMOUS
3464     if (c->lumMmx2FilterCode) munmap(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize);
3465     if (c->chrMmx2FilterCode) munmap(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize);
3466 #elif HAVE_VIRTUALALLOC
3467     if (c->lumMmx2FilterCode) VirtualFree(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize, MEM_RELEASE);
3468     if (c->chrMmx2FilterCode) VirtualFree(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, MEM_RELEASE);
3469 #else
3470     av_free(c->lumMmx2FilterCode);
3471     av_free(c->chrMmx2FilterCode);
3472 #endif
3473     c->lumMmx2FilterCode=NULL;
3474     c->chrMmx2FilterCode=NULL;
3475 #endif /* ARCH_X86 && CONFIG_GPL */
3476
3477     av_freep(&c->lumMmx2Filter);
3478     av_freep(&c->chrMmx2Filter);
3479     av_freep(&c->lumMmx2FilterPos);
3480     av_freep(&c->chrMmx2FilterPos);
3481     av_freep(&c->yuvTable);
3482
3483     av_free(c);
3484 }
3485
3486 struct SwsContext *sws_getCachedContext(struct SwsContext *context,
3487                                         int srcW, int srcH, enum PixelFormat srcFormat,
3488                                         int dstW, int dstH, enum PixelFormat dstFormat, int flags,
3489                                         SwsFilter *srcFilter, SwsFilter *dstFilter, const double *param)
3490 {
3491     static const double default_param[2] = {SWS_PARAM_DEFAULT, SWS_PARAM_DEFAULT};
3492
3493     if (!param)
3494         param = default_param;
3495
3496     if (context) {
3497         if (context->srcW != srcW || context->srcH != srcH ||
3498             context->srcFormat != srcFormat ||
3499             context->dstW != dstW || context->dstH != dstH ||
3500             context->dstFormat != dstFormat || context->flags != flags ||
3501             context->param[0] != param[0] || context->param[1] != param[1])
3502         {
3503             sws_freeContext(context);
3504             context = NULL;
3505         }
3506     }
3507     if (!context) {
3508         return sws_getContext(srcW, srcH, srcFormat,
3509                               dstW, dstH, dstFormat, flags,
3510                               srcFilter, dstFilter, param);
3511     }
3512     return context;
3513 }
3514