git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
  71
  /*
   * Evaluates to nonzero when pixel format x is PAL8, YUYV422, UYVY422, Y400A,
   * or any format for which isAnyRGB() (defined outside this part of the file)
   * is true.
   * The argument is expanded several times, so it must not have side effects.
   */
  72 #define isPacked(x)         (       \
  73            (x)==PIX_FMT_PAL8        \
  74         || (x)==PIX_FMT_YUYV422     \
  75         || (x)==PIX_FMT_UYVY422     \
  76         || (x)==PIX_FMT_Y400A       \
  77         || isAnyRGB(x)              \
  78     )
  79
  /*
   * Integer RGB -> YUV weights. Each real coefficient is multiplied by
   * (1 << RGB2YUV_SHIFT) and rounded by adding 0.5 before the truncating
   * cast to int; negative weights are negated after rounding.
   * The coefficient magnitudes are the same as those in the ITU601 rows of
   * rgb2yuv_table[] below. The 219/255 and 224/255 factors presumably scale
   * full-range RGB to limited-range luma/chroma (TODO confirm).
   * Names read <input channel><output component>, e.g. BY is the weight of B
   * in Y (inferred from the values; verify against the users of these macros).
   */
  80 #define RGB2YUV_SHIFT 15
  81 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  82 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  83 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  84 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  85 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  86 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  87 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  88 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  89 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  90
  /*
   * Floating-point RGB -> YUV coefficient sets, one row of nine values per
   * entry. For the ITU601 rows the nine values correspond, in order, to the
   * GY, BY, RY, GU, BU, RU, GV, BV, RV weights defined by the macros above
   * (same magnitudes, signs included here).
   * The row index is presumably a colorspace identifier; the per-row comments
   * name the standard each row represents (verify the index mapping against
   * the code that reads this table, which is not visible here).
   */
  91 static const double rgb2yuv_table[8][9]={
  92     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  93     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  94     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  95     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  96     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  97     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  98     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  99     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
 100 };
 101
 102 /*
 103 NOTES
 104 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 105
 106 TODO
 107 more intelligent misalignment avoidance for the horizontal scaler
 108 write special vertical cubic upscale version
 109 optimize C code (YV12 / minmax)
 110 add support for packed pixel YUV input & output
 111 add support for Y8 output
 112 optimize BGR24 & BGR32
 113 add BGR4 output support
 114 write special BGR->BGR scaler
 115 */
 116
 /*
  * 2-row dither pattern with offsets in 0..3; each row repeats a 2-entry
  * pattern four times. The RGB565/BGR565 output case further down selects
  * the row with y&1 and uses columns 0 and 1 as the green offsets of the
  * two pixels of a pair.
  */
 117 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
 118 {  1,   3,   1,   3,   1,   3,   1,   3, },
 119 {  2,   0,   2,   0,   2,   0,   2,   0, },
 120 };
 121
 /*
  * 2-row dither pattern with offsets in {0, 2, 4, 6}; each row repeats a
  * 2-entry pattern four times. Used by the 565 output case (red and blue
  * offsets) and the 555 output case (all three channels) further down; the
  * row is y&1 for red/green and (y&1)^1 for blue.
  */
 122 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 123 {  6,   2,   6,   2,   6,   2,   6,   2, },
 124 {  0,   4,   0,   4,   0,   4,   0,   4, },
 125 };
 126
 /*
  * 4-row dither pattern with offsets in 0..15; each row repeats a 4-entry
  * pattern twice. Not declared static, so it has external linkage.
  * The RGB444/BGR444 output case further down selects the row with y&3
  * (red/green) or (y&3)^3 (blue) and reads columns 0 and 1.
  */
 127 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 128 {  8,   4,  11,   7,   8,   4,  11,   7, },
 129 {  2,  14,   1,  13,   2,  14,   1,  13, },
 130 { 10,   6,   9,   5,  10,   6,   9,   5, },
 131 {  0,  12,   3,  15,   0,  12,   3,  15, },
 132 };
 133
 /*
  * 8x8 dither pattern with offsets in 0..31. Not declared static, so it has
  * external linkage. The RGB8/BGR8 output case further down selects the row
  * with y&7 and the column with (pixel index)&7 for the red and green
  * offsets.
  */
 134 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 135 { 17,   9,  23,  15,  16,   8,  22,  14, },
 136 {  5,  29,   3,  27,   4,  28,   2,  26, },
 137 { 21,  13,  19,  11,  20,  12,  18,  10, },
 138 {  0,  24,   6,  30,   1,  25,   7,  31, },
 139 { 16,   8,  22,  14,  17,   9,  23,  15, },
 140 {  4,  28,   2,  26,   5,  29,   3,  27, },
 141 { 20,  12,  18,  10,  21,  13,  19,  11, },
 142 {  1,  25,   7,  31,   0,  24,   6,  30, },
 143 };
 144
 /*
  * 8x8 dither pattern with offsets in 0..72. Not declared static, so it has
  * external linkage. Further down it is bound to the local name d64 (row y&7)
  * and supplies the blue offset for RGB8/BGR8 and the green offset for the
  * 4-bit RGB/BGR output cases.
  */
 145 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 146 {  0,  55,  14,  68,   3,  58,  17,  72, },
 147 { 37,  18,  50,  32,  40,  22,  54,  35, },
 148 {  9,  64,   5,  59,  13,  67,   8,  63, },
 149 { 46,  27,  41,  23,  49,  31,  44,  26, },
 150 {  2,  57,  16,  71,   1,  56,  15,  70, },
 151 { 39,  21,  52,  34,  38,  19,  51,  33, },
 152 { 11,  66,   7,  62,  10,  65,   6,  60, },
 153 { 48,  30,  43,  25,  47,  29,  42,  24, },
 154 };
 155
 /*
  * 8x8 dither pattern used (as d128, row y&7) by the monochrome writers and
  * the 4-bit RGB/BGR output cases further down.
  * Four alternative tables are kept here. Because the first condition is
  * "#if 1", only the first table (offsets 0..217) is ever compiled; the
  * gamma-corrected variants below are retained as disabled alternatives.
  */
 156 #if 1
 157 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 158 {117,  62, 158, 103, 113,  58, 155, 100, },
 159 { 34, 199,  21, 186,  31, 196,  17, 182, },
 160 {144,  89, 131,  76, 141,  86, 127,  72, },
 161 {  0, 165,  41, 206,  10, 175,  52, 217, },
 162 {110,  55, 151,  96, 120,  65, 162, 107, },
 163 { 28, 193,  14, 179,  38, 203,  24, 189, },
 164 {138,  83, 124,  69, 148,  93, 134,  79, },
 165 {  7, 172,  48, 213,   3, 168,  45, 210, },
 166 };
 167 #elif 1
 168 // tries to correct a gamma of 1.5
 169 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 170 {  0, 143,  18, 200,   2, 156,  25, 215, },
 171 { 78,  28, 125,  64,  89,  36, 138,  74, },
 172 { 10, 180,   3, 161,  16, 195,   8, 175, },
 173 {109,  51,  93,  38, 121,  60, 105,  47, },
 174 {  1, 152,  23, 210,   0, 147,  20, 205, },
 175 { 85,  33, 134,  71,  81,  30, 130,  67, },
 176 { 14, 190,   6, 171,  12, 185,   5, 166, },
 177 {117,  57, 101,  44, 113,  54,  97,  41, },
 178 };
 179 #elif 1
 180 // tries to correct a gamma of 2.0
 181 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 182 {  0, 124,   8, 193,   0, 140,  12, 213, },
 183 { 55,  14, 104,  42,  66,  19, 119,  52, },
 184 {  3, 168,   1, 145,   6, 187,   3, 162, },
 185 { 86,  31,  70,  21,  99,  39,  82,  28, },
 186 {  0, 134,  11, 206,   0, 129,   9, 200, },
 187 { 62,  17, 114,  48,  58,  16, 109,  45, },
 188 {  5, 181,   2, 157,   4, 175,   1, 151, },
 189 { 95,  36,  78,  26,  90,  34,  74,  24, },
 190 };
 191 #else
 192 // tries to correct a gamma of 2.5
 193 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 194 {  0, 107,   3, 187,   0, 125,   6, 212, },
 195 { 39,   7,  86,  28,  49,  11, 102,  36, },
 196 {  1, 158,   0, 131,   3, 180,   1, 151, },
 197 { 68,  19,  52,  12,  81,  25,  64,  17, },
 198 {  0, 119,   5, 203,   0, 113,   4, 195, },
 199 { 45,   9,  96,  33,  42,   8,  91,  30, },
 200 {  2, 172,   1, 144,   2, 165,   0, 137, },
 201 { 77,  23,  60,  15,  72,  21,  56,  14, },
 202 };
 203 #endif
 204
 205 static av_always_inline void
 206 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 207                       int lumFilterSize, const int16_t *chrFilter,
 208                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 209                       int chrFilterSize, const int16_t **alpSrc,
 210                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 211                       uint16_t *aDest, int dstW, int chrDstW,
 212                       int big_endian, int output_bits)
 213 {
 214     //FIXME Optimize (just quickly written not optimized..)
 215     int i;
 216     int shift = 11 + 16 - output_bits;
 217
 218 #define output_pixel(pos, val) \
 219     if (big_endian) { \
 220         if (output_bits == 16) { \
 221             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 222         } else { \
 223             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 224         } \
 225     } else { \
 226         if (output_bits == 16) { \
 227             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 228         } else { \
 229             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 230         } \
 231     }
 232     for (i = 0; i < dstW; i++) {
 233         int val = 1 << (26-output_bits);
 234         int j;
 235
 236         for (j = 0; j < lumFilterSize; j++)
 237             val += lumSrc[j][i] * lumFilter[j];
 238
 239         output_pixel(&dest[i], val);
 240     }
 241
 242     if (uDest) {
 243         for (i = 0; i < chrDstW; i++) {
 244             int u = 1 << (26-output_bits);
 245             int v = 1 << (26-output_bits);
 246             int j;
 247
 248             for (j = 0; j < chrFilterSize; j++) {
 249                 u += chrUSrc[j][i] * chrFilter[j];
 250                 v += chrVSrc[j][i] * chrFilter[j];
 251             }
 252
 253             output_pixel(&uDest[i], u);
 254             output_pixel(&vDest[i], v);
 255         }
 256     }
 257
 258     if (CONFIG_SWSCALE_ALPHA && aDest) {
 259         for (i = 0; i < dstW; i++) {
 260             int val = 1 << (26-output_bits);
 261             int j;
 262
 263             for (j = 0; j < lumFilterSize; j++)
 264                 val += alpSrc[j][i] * lumFilter[j];
 265
 266             output_pixel(&aDest[i], val);
 267         }
 268     }
 269 }
 270
 /*
  * Generate yuv2yuvX<bits><BE_LE>_c(): a wrapper around
  * yuv2yuvX16_c_template() with the endianness flag (is_be) and the output
  * bit depth (bits) fixed as constants.
  *
  * The instantiations below are deliberately written without a trailing
  * semicolon: each one expands to a complete function definition, and a
  * lone ';' at file scope would be an empty declaration, which ISO C does
  * not allow.
  */
 #define yuv2NBPS(bits, BE_LE, is_be) \
 static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *lumFilter, \
                               const int16_t **lumSrc, int lumFilterSize, \
                               const int16_t *chrFilter, const int16_t **chrUSrc, \
                               const int16_t **chrVSrc, \
                               int chrFilterSize, const int16_t **alpSrc, \
                               uint16_t *dest, uint16_t *uDest, uint16_t *vDest, \
                               uint16_t *aDest, int dstW, int chrDstW) \
 { \
     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                           alpSrc, \
                           dest, uDest, vDest, aDest, \
                           dstW, chrDstW, is_be, bits); \
 }
 yuv2NBPS( 9, BE, 1)
 yuv2NBPS( 9, LE, 0)
 yuv2NBPS(10, BE, 1)
 yuv2NBPS(10, LE, 0)
 yuv2NBPS(16, BE, 1)
 yuv2NBPS(16, LE, 0)
 292
 293 static inline void yuv2yuvX16_c(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 294                                 const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize,
 295                                 const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, int dstW, int chrDstW,
 296                                 enum PixelFormat dstFormat)
 297 {
 298 #define conv16(bits) \
 299     if (isBE(dstFormat)) { \
 300         yuv2yuvX ## bits ## BE_c(lumFilter, lumSrc, lumFilterSize, \
 301                                chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 302                                alpSrc, \
 303                                dest, uDest, vDest, aDest, \
 304                                dstW, chrDstW); \
 305     } else { \
 306         yuv2yuvX ## bits ## LE_c(lumFilter, lumSrc, lumFilterSize, \
 307                                chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 308                                alpSrc, \
 309                                dest, uDest, vDest, aDest, \
 310                                dstW, chrDstW); \
 311     }
 312     if (is16BPS(dstFormat)) {
 313         conv16(16);
 314     } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
 315         conv16(9);
 316     } else {
 317         conv16(10);
 318     }
 319 #undef conv16
 320 }
 321
 322 static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 323                               const int16_t **lumSrc, int lumFilterSize,
 324                               const int16_t *chrFilter, const int16_t **chrUSrc,
 325                               const int16_t **chrVSrc,
 326                               int chrFilterSize, const int16_t **alpSrc,
 327                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 328                               uint8_t *aDest, int dstW, int chrDstW)
 329 {
 330     //FIXME Optimize (just quickly written not optimized..)
 331     int i;
 332     for (i=0; i<dstW; i++) {
 333         int val=1<<18;
 334         int j;
 335         for (j=0; j<lumFilterSize; j++)
 336             val += lumSrc[j][i] * lumFilter[j];
 337
 338         dest[i]= av_clip_uint8(val>>19);
 339     }
 340
 341     if (uDest)
 342         for (i=0; i<chrDstW; i++) {
 343             int u=1<<18;
 344             int v=1<<18;
 345             int j;
 346             for (j=0; j<chrFilterSize; j++) {
 347                 u += chrUSrc[j][i] * chrFilter[j];
 348                 v += chrVSrc[j][i] * chrFilter[j];
 349             }
 350
 351             uDest[i]= av_clip_uint8(u>>19);
 352             vDest[i]= av_clip_uint8(v>>19);
 353         }
 354
 355     if (CONFIG_SWSCALE_ALPHA && aDest)
 356         for (i=0; i<dstW; i++) {
 357             int val=1<<18;
 358             int j;
 359             for (j=0; j<lumFilterSize; j++)
 360                 val += alpSrc[j][i] * lumFilter[j];
 361
 362             aDest[i]= av_clip_uint8(val>>19);
 363         }
 364
 365 }
 366
 367 static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 368                                const int16_t **lumSrc, int lumFilterSize,
 369                                const int16_t *chrFilter, const int16_t **chrUSrc,
 370                                const int16_t **chrVSrc,
 371                                int chrFilterSize, uint8_t *dest, uint8_t *uDest,
 372                                int dstW, int chrDstW, enum PixelFormat dstFormat)
 373 {
 374     //FIXME Optimize (just quickly written not optimized..)
 375     int i;
 376     for (i=0; i<dstW; i++) {
 377         int val=1<<18;
 378         int j;
 379         for (j=0; j<lumFilterSize; j++)
 380             val += lumSrc[j][i] * lumFilter[j];
 381
 382         dest[i]= av_clip_uint8(val>>19);
 383     }
 384
 385     if (!uDest)
 386         return;
 387
 388     if (dstFormat == PIX_FMT_NV12)
 389         for (i=0; i<chrDstW; i++) {
 390             int u=1<<18;
 391             int v=1<<18;
 392             int j;
 393             for (j=0; j<chrFilterSize; j++) {
 394                 u += chrUSrc[j][i] * chrFilter[j];
 395                 v += chrVSrc[j][i] * chrFilter[j];
 396             }
 397
 398             uDest[2*i]= av_clip_uint8(u>>19);
 399             uDest[2*i+1]= av_clip_uint8(v>>19);
 400         }
 401     else
 402         for (i=0; i<chrDstW; i++) {
 403             int u=1<<18;
 404             int v=1<<18;
 405             int j;
 406             for (j=0; j<chrFilterSize; j++) {
 407                 u += chrUSrc[j][i] * chrFilter[j];
 408                 v += chrVSrc[j][i] * chrFilter[j];
 409             }
 410
 411             uDest[2*i]= av_clip_uint8(v>>19);
 412             uDest[2*i+1]= av_clip_uint8(u>>19);
 413         }
 414 }
 415
 /*
  * Loop header for vertically filtered packed output, two pixels per
  * iteration, without clipping.
  *
  * Opens a "for (i...) {" over dstW>>1 pixel pairs and leaves it open; the
  * code following the expansion must supply the closing brace.
  * Inside the loop it defines:
  *   Y1, Y2  filtered luma of pixels 2*i and 2*i+1
  *   U, V    filtered chroma of sample i
  *   A1, A2  filtered alpha (luma taps), assigned only when `alpha` is true
  *   r, g, b pointers to `type`, declared but not assigned here
  *   i2      2*i
  * Each value is (2^18 + sum_j src[j][..] * filter[j]) >> 19.
  * Expects i, dstW, lumSrc, lumFilter, lumFilterSize, chrUSrc, chrVSrc,
  * chrFilter, chrFilterSize and (if alpha) alpSrc in the enclosing scope.
  */
 416 #define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha) \
 417     for (i=0; i<(dstW>>1); i++) {\
 418         int j;\
 419         int Y1 = 1<<18;\
 420         int Y2 = 1<<18;\
 421         int U  = 1<<18;\
 422         int V  = 1<<18;\
 423         int av_unused A1, A2;\
 424         type av_unused *r, *b, *g;\
 425         const int i2= 2*i;\
 426         \
 427         for (j=0; j<lumFilterSize; j++) {\
 428             Y1 += lumSrc[j][i2] * lumFilter[j];\
 429             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 430         }\
 431         for (j=0; j<chrFilterSize; j++) {\
 432             U += chrUSrc[j][i] * chrFilter[j];\
 433             V += chrVSrc[j][i] * chrFilter[j];\
 434         }\
 435         Y1>>=19;\
 436         Y2>>=19;\
 437         U >>=19;\
 438         V >>=19;\
 439         if (alpha) {\
 440             A1 = 1<<18;\
 441             A2 = 1<<18;\
 442             for (j=0; j<lumFilterSize; j++) {\
 443                 A1 += alpSrc[j][i2  ] * lumFilter[j];\
 444                 A2 += alpSrc[j][i2+1] * lumFilter[j];\
 445             }\
 446             A1>>=19;\
 447             A2>>=19;\
 448         }
 449
 /*
  * YSCALE_YUV_2_PACKEDX_NOCLIP_C followed by clamping of Y1, Y2, U, V (and
  * A1, A2 when `alpha` is true) to 0..255. The loop opened by the inner
  * macro is still open after this expansion.
  *
  * The "& 256" test is a cheap pre-check on bit 8 of the OR of the values:
  * it fires for values in 256..511 and, in two's complement, -256..-1.
  * NOTE(review): a value further out of range whose bit 8 happens to be
  * clear would not trigger the clamp -- confirm the filters cannot produce
  * such values.
  */
 450 #define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
 451         YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha)\
 452         if ((Y1|Y2|U|V)&256) {\
 453             if (Y1>255)   Y1=255; \
 454             else if (Y1<0)Y1=0;   \
 455             if (Y2>255)   Y2=255; \
 456             else if (Y2<0)Y2=0;   \
 457             if (U>255)    U=255;  \
 458             else if (U<0) U=0;    \
 459             if (V>255)    V=255;  \
 460             else if (V<0) V=0;    \
 461         }\
 462         if (alpha && ((A1|A2)&256)) {\
 463             A1=av_clip_uint8(A1);\
 464             A2=av_clip_uint8(A2);\
 465         }
 466
 /*
  * Loop header for vertically filtered output, one pixel per iteration.
  *
  * Opens a "for (i...) {" over dstW pixels and leaves it open; the code
  * following the expansion must supply the closing brace.
  * Inside the loop it defines:
  *   Y     (sum_j lumSrc[j][i] * lumFilter[j]) >> 10
  *   U, V  (sum_j chr?Src[j][i] * chrFilter[j] - (128<<19)) >> 10, i.e. the
  *         chroma sums with a bias of 128<<19 removed before the shift
  *   A     only when `alpha` is true:
  *         (rnd + sum_j alpSrc[j][i] * lumFilter[j]) >> 19, passed through
  *         av_clip_uint8() if bit 8 is set
  *   R, G, B  declared but not assigned here
  * Expects i, dstW, lumSrc, lumFilter, lumFilterSize, chrUSrc, chrVSrc,
  * chrFilter, chrFilterSize and (if alpha) alpSrc in the enclosing scope.
  *
  * The chroma bias is written as -(128<<19): left-shifting the negative
  * value -128 would be undefined behaviour in C.
  */
 #define YSCALE_YUV_2_PACKEDX_FULL_C(rnd,alpha) \
     for (i=0; i<dstW; i++) {\
         int j;\
         int Y = 0;\
         int U = -(128<<19);\
         int V = -(128<<19);\
         int av_unused A;\
         int R,G,B;\
         \
         for (j=0; j<lumFilterSize; j++) {\
             Y += lumSrc[j][i     ] * lumFilter[j];\
         }\
         for (j=0; j<chrFilterSize; j++) {\
             U += chrUSrc[j][i] * chrFilter[j];\
             V += chrVSrc[j][i] * chrFilter[j];\
         }\
         Y >>=10;\
         U >>=10;\
         V >>=10;\
         if (alpha) {\
             A = rnd;\
             for (j=0; j<lumFilterSize; j++)\
                 A += alpSrc[j][i     ] * lumFilter[j];\
             A >>=19;\
             if (A&256)\
                 A = av_clip_uint8(A);\
         }
 494
 /*
  * YSCALE_YUV_2_PACKEDX_FULL_C (with rnd>>3 as its rounding term) followed
  * by a per-pixel YUV -> RGB conversion using the coefficients stored in
  * the context `c`:
  *   Y = (Y - yuv2rgb_y_offset) * yuv2rgb_y_coeff + rnd
  *   R = Y + V*v2r,  G = Y + V*v2g + U*u2g,  B = Y + U*u2b
  * R, G and B are then clamped to 0 .. (256<<22)-1; the clamp is entered
  * only when one of the two top bits of R|G|B is set.
  * The loop opened by the inner macro is still open after this expansion.
  */
 495 #define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
 496     YSCALE_YUV_2_PACKEDX_FULL_C(rnd>>3,alpha)\
 497         Y-= c->yuv2rgb_y_offset;\
 498         Y*= c->yuv2rgb_y_coeff;\
 499         Y+= rnd;\
 500         R= Y + V*c->yuv2rgb_v2r_coeff;\
 501         G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
 502         B= Y +                          U*c->yuv2rgb_u2b_coeff;\
 503         if ((R|G|B)&(0xC0000000)) {\
 504             if (R>=(256<<22))   R=(256<<22)-1; \
 505             else if (R<0)R=0;   \
 506             if (G>=(256<<22))   G=(256<<22)-1; \
 507             else if (G<0)G=0;   \
 508             if (B>=(256<<22))   B=(256<<22)-1; \
 509             else if (B<0)B=0;   \
 510         }
 511
 /*
  * Loop header for vertically filtered 16-bit gray output, two pixels per
  * iteration. Opens a "for (i...) {" over dstW>>1 pairs and leaves it open.
  *
  * Y1 and Y2 are (2^18 + sum_j lumSrc[j][..] * lumFilter[j]) >> 11, clamped
  * to 0..65535 when bit 16 of Y1|Y2 is set.
  * U and V are initialised to 1<<18 and never modified in this macro; since
  * bit 16 of that constant is clear they do not influence the clamp test.
  */
 512 #define YSCALE_YUV_2_GRAY16_C \
 513     for (i=0; i<(dstW>>1); i++) {\
 514         int j;\
 515         int Y1 = 1<<18;\
 516         int Y2 = 1<<18;\
 517         int U  = 1<<18;\
 518         int V  = 1<<18;\
 519         \
 520         const int i2= 2*i;\
 521         \
 522         for (j=0; j<lumFilterSize; j++) {\
 523             Y1 += lumSrc[j][i2] * lumFilter[j];\
 524             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 525         }\
 526         Y1>>=11;\
 527         Y2>>=11;\
 528         if ((Y1|Y2|U|V)&65536) {\
 529             if (Y1>65535)   Y1=65535; \
 530             else if (Y1<0)Y1=0;   \
 531             if (Y2>65535)   Y2=65535; \
 532             else if (Y2<0)Y2=0;   \
 533         }
 534
 /*
  * YSCALE_YUV_2_PACKEDX_C followed by selection of the per-pixel-pair
  * lookup tables: r from c->table_rV[V], b from c->table_bU[U], and g from
  * the sum c->table_gU[U] + c->table_gV[V]. The results are cast to
  * `type *`; callers index them with the (dithered) luma value.
  * The loop opened by the inner macro is still open after this expansion.
  */
 535 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
 536     YSCALE_YUV_2_PACKEDX_C(type,alpha)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
 537     r = (type *)c->table_rV[V];   \
 538     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
 539     b = (type *)c->table_bU[U];
 540
 /*
  * Loop header for packed output blended from two source lines, two pixels
  * per iteration. Opens a "for (i...) {" over dstW>>1 pairs and leaves it
  * open.
  *
  * Y1, Y2 = (buf0*yalpha1 + buf1*yalpha) >> 19 for pixels 2*i and 2*i+1;
  * U, V   = (?buf0*uvalpha1 + ?buf1*uvalpha) >> 19 for sample i;
  * A1, A2 are blended the same way from abuf0/abuf1 when `alpha` is true.
  * r, g, b are declared but not assigned. No clipping is applied.
  * Expects i, dstW, buf0, buf1, ubuf0, ubuf1, vbuf0, vbuf1, yalpha, yalpha1,
  * uvalpha, uvalpha1 and (if alpha) abuf0, abuf1 in the enclosing scope.
  */
 541 #define YSCALE_YUV_2_PACKED2_C(type,alpha)   \
 542     for (i=0; i<(dstW>>1); i++) { \
 543         const int i2= 2*i;       \
 544         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
 545         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
 546         int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19;              \
 547         int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19;              \
 548         type av_unused *r, *b, *g;                                    \
 549         int av_unused A1, A2;                                         \
 550         if (alpha) {\
 551             A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
 552             A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
 553         }
 554
 /*
  * Loop header for 16-bit gray output blended from two source lines, two
  * pixels per iteration. Opens a "for (i...) {" over dstW>>1 pairs and
  * leaves it open. Y1, Y2 = (buf0*yalpha1 + buf1*yalpha) >> 11; no clipping.
  */
 555 #define YSCALE_YUV_2_GRAY16_2_C   \
 556     for (i=0; i<(dstW>>1); i++) { \
 557         const int i2= 2*i;       \
 558         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
 559         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;
 560
 /*
  * YSCALE_YUV_2_PACKED2_C followed by selection of the lookup tables:
  * r from c->table_rV[V], b from c->table_bU[U], and g from the sum
  * c->table_gU[U] + c->table_gV[V], each cast to `type *`.
  * The loop opened by the inner macro is still open after this expansion.
  */
 561 #define YSCALE_YUV_2_RGB2_C(type,alpha) \
 562     YSCALE_YUV_2_PACKED2_C(type,alpha)\
 563     r = (type *)c->table_rV[V];\
 564     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 565     b = (type *)c->table_bU[U];
 566
 /*
  * Loop header for packed output taken from a single source line, two
  * pixels per iteration. Opens a "for (i...) {" over dstW>>1 pairs and
  * leaves it open.
  *
  * Y1, Y2 = buf0[..] >> 7; U, V = ubuf1[i] >> 7 and vbuf1[i] >> 7 (note:
  * chroma is read from the *1 buffers only; ubuf0/vbuf0 are not used);
  * A1, A2 = abuf0[..] >> 7 when `alpha` is true.
  * r, g, b are declared but not assigned. No clipping is applied.
  */
 567 #define YSCALE_YUV_2_PACKED1_C(type,alpha) \
 568     for (i=0; i<(dstW>>1); i++) {\
 569         const int i2= 2*i;\
 570         int Y1= buf0[i2  ]>>7;\
 571         int Y2= buf0[i2+1]>>7;\
 572         int U= (ubuf1[i])>>7;\
 573         int V= (vbuf1[i])>>7;\
 574         type av_unused *r, *b, *g;\
 575         int av_unused A1, A2;\
 576         if (alpha) {\
 577             A1= abuf0[i2  ]>>7;\
 578             A2= abuf0[i2+1]>>7;\
 579         }
 580
 /*
  * Loop header for 16-bit gray output taken from a single source line, two
  * pixels per iteration. Opens a "for (i...) {" over dstW>>1 pairs and
  * leaves it open. Y1, Y2 = buf0[..] << 1; no clipping.
  */
 581 #define YSCALE_YUV_2_GRAY16_1_C \
 582     for (i=0; i<(dstW>>1); i++) {\
 583         const int i2= 2*i;\
 584         int Y1= buf0[i2  ]<<1;\
 585         int Y2= buf0[i2+1]<<1;
 586
 /*
  * YSCALE_YUV_2_PACKED1_C followed by selection of the lookup tables:
  * r from c->table_rV[V], b from c->table_bU[U], and g from the sum
  * c->table_gU[U] + c->table_gV[V], each cast to `type *`.
  * The loop opened by the inner macro is still open after this expansion.
  */
 587 #define YSCALE_YUV_2_RGB1_C(type,alpha) \
 588     YSCALE_YUV_2_PACKED1_C(type,alpha)\
 589     r = (type *)c->table_rV[V];\
 590     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 591     b = (type *)c->table_bU[U];
 592
 /*
  * Variant of YSCALE_YUV_2_PACKED1_C that averages the two chroma lines:
  * U = (ubuf0[i] + ubuf1[i]) >> 8 and V = (vbuf0[i] + vbuf1[i]) >> 8.
  * Luma and alpha are still taken from a single line (buf0/abuf0 >> 7).
  * Opens a "for (i...) {" over dstW>>1 pixel pairs and leaves it open;
  * r, g, b are declared but not assigned. No clipping is applied.
  */
 593 #define YSCALE_YUV_2_PACKED1B_C(type,alpha) \
 594     for (i=0; i<(dstW>>1); i++) {\
 595         const int i2= 2*i;\
 596         int Y1= buf0[i2  ]>>7;\
 597         int Y2= buf0[i2+1]>>7;\
 598         int U= (ubuf0[i] + ubuf1[i])>>8;\
 599         int V= (vbuf0[i] + vbuf1[i])>>8;\
 600         type av_unused *r, *b, *g;\
 601         int av_unused A1, A2;\
 602         if (alpha) {\
 603             A1= abuf0[i2  ]>>7;\
 604             A2= abuf0[i2+1]>>7;\
 605         }
 606
 /*
  * YSCALE_YUV_2_PACKED1B_C followed by selection of the lookup tables:
  * r from c->table_rV[V], b from c->table_bU[U], and g from the sum
  * c->table_gU[U] + c->table_gV[V], each cast to `type *`.
  * The loop opened by the inner macro is still open after this expansion.
  */
 607 #define YSCALE_YUV_2_RGB1B_C(type,alpha) \
 608     YSCALE_YUV_2_PACKED1B_C(type,alpha)\
 609     r = (type *)c->table_rV[V];\
 610     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 611     b = (type *)c->table_bU[U];
 612
 /*
  * Complete writer for 1 bit-per-pixel output blended from two source
  * lines (buf0/buf1 weighted by yalpha1/yalpha, >> 19).
  *
  * For each group of 8 pixels, every blended luma value plus the dither
  * offset d128[k] (row y&7 of dither_8x8_220) indexes the table
  * g = c->table_gU[128] + c->table_gV[128]; "acc += acc + x" doubles the
  * accumulator and adds x, so the first pixel ends up in the most
  * significant position. The low byte of acc is stored to dest as-is for
  * PIX_FMT_MONOBLACK and bit-inverted otherwise, and dest advances by one.
  * The loop condition i < dstW-7 means a trailing group of fewer than 8
  * pixels is not written.
  */
 613 #define YSCALE_YUV_2_MONO2_C \
 614     const uint8_t * const d128=dither_8x8_220[y&7];\
 615     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 616     for (i=0; i<dstW-7; i+=8) {\
 617         int acc;\
 618         acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
 619         acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
 620         acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
 621         acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
 622         acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
 623         acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
 624         acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
 625         acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
 626         ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 627         dest++;\
 628     }
 629
 /*
  * Complete writer for 1 bit-per-pixel output with a full vertical filter.
  *
  * Two pixels are processed per iteration: Y1 and Y2 are
  * (2^18 + sum_j lumSrc[j][..] * lumFilter[j]) >> 19, clamped to 0..255 when
  * bit 8 of Y1|Y2 is set. Each value plus its dither offset (row y&7 of
  * dither_8x8_220, column pixel&7) indexes
  * g = c->table_gU[128] + c->table_gV[128], and the result is shifted into
  * acc via "acc += acc + x".
  * A byte is stored only when (i&7)==6, i.e. after the eighth pixel of a
  * group: as-is for PIX_FMT_MONOBLACK, bit-inverted otherwise. acc is never
  * reset, but only its low 8 bits reach the uint8_t store. Pixels of an
  * incomplete trailing group are accumulated but never written.
  */
 630 #define YSCALE_YUV_2_MONOX_C \
 631     const uint8_t * const d128=dither_8x8_220[y&7];\
 632     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 633     int acc=0;\
 634     for (i=0; i<dstW-1; i+=2) {\
 635         int j;\
 636         int Y1=1<<18;\
 637         int Y2=1<<18;\
 638 \
 639         for (j=0; j<lumFilterSize; j++) {\
 640             Y1 += lumSrc[j][i] * lumFilter[j];\
 641             Y2 += lumSrc[j][i+1] * lumFilter[j];\
 642         }\
 643         Y1>>=19;\
 644         Y2>>=19;\
 645         if ((Y1|Y2)&256) {\
 646             if (Y1>255)   Y1=255;\
 647             else if (Y1<0)Y1=0;\
 648             if (Y2>255)   Y2=255;\
 649             else if (Y2<0)Y2=0;\
 650         }\
 651         acc+= acc + g[Y1+d128[(i+0)&7]];\
 652         acc+= acc + g[Y2+d128[(i+1)&7]];\
 653         if ((i&7)==6) {\
 654             ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 655             dest++;\
 656         }\
 657     }
 658
 659 #define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
 660     switch(c->dstFormat) {\
 661     case PIX_FMT_RGB48BE:\
 662     case PIX_FMT_RGB48LE:\
 663         func(uint8_t,0)\
 664             ((uint8_t*)dest)[ 0]= r[Y1];\
 665             ((uint8_t*)dest)[ 1]= r[Y1];\
 666             ((uint8_t*)dest)[ 2]= g[Y1];\
 667             ((uint8_t*)dest)[ 3]= g[Y1];\
 668             ((uint8_t*)dest)[ 4]= b[Y1];\
 669             ((uint8_t*)dest)[ 5]= b[Y1];\
 670             ((uint8_t*)dest)[ 6]= r[Y2];\
 671             ((uint8_t*)dest)[ 7]= r[Y2];\
 672             ((uint8_t*)dest)[ 8]= g[Y2];\
 673             ((uint8_t*)dest)[ 9]= g[Y2];\
 674             ((uint8_t*)dest)[10]= b[Y2];\
 675             ((uint8_t*)dest)[11]= b[Y2];\
 676             dest+=12;\
 677         }\
 678         break;\
 679     case PIX_FMT_BGR48BE:\
 680     case PIX_FMT_BGR48LE:\
 681         func(uint8_t,0)\
 682             ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
 683             ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
 684             ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
 685             ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
 686             ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
 687             ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
 688             dest+=12;\
 689         }\
 690         break;\
 691     case PIX_FMT_RGBA:\
 692     case PIX_FMT_BGRA:\
 693         if (CONFIG_SMALL) {\
 694             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 695             func(uint32_t,needAlpha)\
 696                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
 697                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
 698             }\
 699         } else {\
 700             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 701                 func(uint32_t,1)\
 702                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
 703                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
 704                 }\
 705             } else {\
 706                 func(uint32_t,0)\
 707                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 708                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 709                 }\
 710             }\
 711         }\
 712         break;\
 713     case PIX_FMT_ARGB:\
 714     case PIX_FMT_ABGR:\
 715         if (CONFIG_SMALL) {\
 716             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 717             func(uint32_t,needAlpha)\
 718                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
 719                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
 720             }\
 721         } else {\
 722             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 723                 func(uint32_t,1)\
 724                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
 725                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
 726                 }\
 727             } else {\
 728                 func(uint32_t,0)\
 729                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 730                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 731                 }\
 732             }\
 733         }                \
 734         break;\
 735     case PIX_FMT_RGB24:\
 736         func(uint8_t,0)\
 737             ((uint8_t*)dest)[0]= r[Y1];\
 738             ((uint8_t*)dest)[1]= g[Y1];\
 739             ((uint8_t*)dest)[2]= b[Y1];\
 740             ((uint8_t*)dest)[3]= r[Y2];\
 741             ((uint8_t*)dest)[4]= g[Y2];\
 742             ((uint8_t*)dest)[5]= b[Y2];\
 743             dest+=6;\
 744         }\
 745         break;\
 746     case PIX_FMT_BGR24:\
 747         func(uint8_t,0)\
 748             ((uint8_t*)dest)[0]= b[Y1];\
 749             ((uint8_t*)dest)[1]= g[Y1];\
 750             ((uint8_t*)dest)[2]= r[Y1];\
 751             ((uint8_t*)dest)[3]= b[Y2];\
 752             ((uint8_t*)dest)[4]= g[Y2];\
 753             ((uint8_t*)dest)[5]= r[Y2];\
 754             dest+=6;\
 755         }\
 756         break;\
 757     case PIX_FMT_RGB565BE:\
 758     case PIX_FMT_RGB565LE:\
 759     case PIX_FMT_BGR565BE:\
 760     case PIX_FMT_BGR565LE:\
 761         {\
 762             const int dr1= dither_2x2_8[y&1    ][0];\
 763             const int dg1= dither_2x2_4[y&1    ][0];\
 764             const int db1= dither_2x2_8[(y&1)^1][0];\
 765             const int dr2= dither_2x2_8[y&1    ][1];\
 766             const int dg2= dither_2x2_4[y&1    ][1];\
 767             const int db2= dither_2x2_8[(y&1)^1][1];\
 768             func(uint16_t,0)\
 769                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 770                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 771             }\
 772         }\
 773         break;\
 774     case PIX_FMT_RGB555BE:\
 775     case PIX_FMT_RGB555LE:\
 776     case PIX_FMT_BGR555BE:\
 777     case PIX_FMT_BGR555LE:\
 778         {\
 779             const int dr1= dither_2x2_8[y&1    ][0];\
 780             const int dg1= dither_2x2_8[y&1    ][1];\
 781             const int db1= dither_2x2_8[(y&1)^1][0];\
 782             const int dr2= dither_2x2_8[y&1    ][1];\
 783             const int dg2= dither_2x2_8[y&1    ][0];\
 784             const int db2= dither_2x2_8[(y&1)^1][1];\
 785             func(uint16_t,0)\
 786                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 787                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 788             }\
 789         }\
 790         break;\
 791     case PIX_FMT_RGB444BE:\
 792     case PIX_FMT_RGB444LE:\
 793     case PIX_FMT_BGR444BE:\
 794     case PIX_FMT_BGR444LE:\
 795         {\
 796             const int dr1= dither_4x4_16[y&3    ][0];\
 797             const int dg1= dither_4x4_16[y&3    ][1];\
 798             const int db1= dither_4x4_16[(y&3)^3][0];\
 799             const int dr2= dither_4x4_16[y&3    ][1];\
 800             const int dg2= dither_4x4_16[y&3    ][0];\
 801             const int db2= dither_4x4_16[(y&3)^3][1];\
 802             func(uint16_t,0)\
 803                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 804                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 805             }\
 806         }\
 807         break;\
 808     case PIX_FMT_RGB8:\
 809     case PIX_FMT_BGR8:\
 810         {\
 811             const uint8_t * const d64= dither_8x8_73[y&7];\
 812             const uint8_t * const d32= dither_8x8_32[y&7];\
 813             func(uint8_t,0)\
 814                 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
 815                 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
 816             }\
 817         }\
 818         break;\
 819     case PIX_FMT_RGB4:\
 820     case PIX_FMT_BGR4:\
 821         {\
 822             const uint8_t * const d64= dither_8x8_73 [y&7];\
 823             const uint8_t * const d128=dither_8x8_220[y&7];\
 824             func(uint8_t,0)\
 825                 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
 826                                  + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
 827             }\
 828         }\
 829         break;\
 830     case PIX_FMT_RGB4_BYTE:\
 831     case PIX_FMT_BGR4_BYTE:\
 832         {\
 833             const uint8_t * const d64= dither_8x8_73 [y&7];\
 834             const uint8_t * const d128=dither_8x8_220[y&7];\
 835             func(uint8_t,0)\
 836                 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
 837                 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
 838             }\
 839         }\
 840         break;\
 841     case PIX_FMT_MONOBLACK:\
 842     case PIX_FMT_MONOWHITE:\
 843         {\
 844             func_monoblack\
 845         }\
 846         break;\
 847     case PIX_FMT_YUYV422:\
 848         func2\
 849             ((uint8_t*)dest)[2*i2+0]= Y1;\
 850             ((uint8_t*)dest)[2*i2+1]= U;\
 851             ((uint8_t*)dest)[2*i2+2]= Y2;\
 852             ((uint8_t*)dest)[2*i2+3]= V;\
 853         }                \
 854         break;\
 855     case PIX_FMT_UYVY422:\
 856         func2\
 857             ((uint8_t*)dest)[2*i2+0]= U;\
 858             ((uint8_t*)dest)[2*i2+1]= Y1;\
 859             ((uint8_t*)dest)[2*i2+2]= V;\
 860             ((uint8_t*)dest)[2*i2+3]= Y2;\
 861         }                \
 862         break;\
 863     case PIX_FMT_GRAY16BE:\
 864         func_g16\
 865             ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
 866             ((uint8_t*)dest)[2*i2+1]= Y1;\
 867             ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
 868             ((uint8_t*)dest)[2*i2+3]= Y2;\
 869         }                \
 870         break;\
 871     case PIX_FMT_GRAY16LE:\
 872         func_g16\
 873             ((uint8_t*)dest)[2*i2+0]= Y1;\
 874             ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
 875             ((uint8_t*)dest)[2*i2+2]= Y2;\
 876             ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
 877         }                \
 878         break;\
 879     }
 880
     /**
      * Packed-output line writer that takes full vertical filters
      * (lumFilter/chrFilter together with their sizes).
      *
      * The whole body is produced by YSCALE_YUV_2_ANYRGB_C, instantiated here
      * with the "X" flavours of the RGB, packed-YUV, GRAY16 and mono helper
      * macros. Those macros are defined earlier in the file and pick up this
      * function's parameters and locals (e.g. dest, y and i) by name, which is
      * why i is declared even though no statement here uses it directly.
      */
 881 static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
 882                           const int16_t **lumSrc, int lumFilterSize,
 883                           const int16_t *chrFilter, const int16_t **chrUSrc,
 884                           const int16_t **chrVSrc, int chrFilterSize,
 885                           const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 886 {
 887     int i;
 888     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
 889 }
 890
     /**
      * Write one line of 24/32-bit RGB or BGR pixels, selected by c->dstFormat.
      *
      * step is the destination pixel size in bytes (c->dstFormatBpp/8) and
      * aidx is the index, relative to dest, at which the alpha/filler byte is
      * stored. The case labels deliberately fall through so that each format
      * only adjusts dest/aidx before reaching the shared loop:
      *   - RGBA/BGRA: aidx stays 3 (byte after the three colour bytes);
      *   - RGB24/BGR24: aidx becomes 2, so the dest[aidx] store is immediately
      *     overwritten by the dest[2] colour store;
      *   - ARGB/ABGR: dest is advanced by one and aidx ends up -1, i.e. the
      *     byte in front of the three colour bytes.
      *
      * YSCALE_YUV_2_RGBX_FULL_C is defined outside this excerpt; judging by
      * the brace structure it opens the per-pixel block closed by the `}`
      * that follows each use, and it is expected to provide R, G, B (and A
      * when its second argument is non-zero) — confirm against the macro.
      * Any other dstFormat hits assert(0).
      */
 891 static inline void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
 892                                    const int16_t **lumSrc, int lumFilterSize,
 893                                    const int16_t *chrFilter, const int16_t **chrUSrc,
 894                                    const int16_t **chrVSrc, int chrFilterSize,
 895                                    const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 896 {
 897     int i;
 898     int step= c->dstFormatBpp/8;
 899     int aidx= 3;
 900
 901     switch(c->dstFormat) {
 902     case PIX_FMT_ARGB:
 903         dest++;
 904         aidx= 0;
             /* fall through: the decrement below turns aidx into -1 */
 905     case PIX_FMT_RGB24:
 906         aidx--;
             /* fall through */
 907     case PIX_FMT_RGBA:
 908         if (CONFIG_SMALL) {
                 /* size-optimised build: one loop, alpha decided at run time */
 909             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 910             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 911                 dest[aidx]= needAlpha ? A : 255;
 912                 dest[0]= R>>22;
 913                 dest[1]= G>>22;
 914                 dest[2]= B>>22;
 915                 dest+= step;
 916             }
 917         } else {
                 /* two specialised loops: with and without a source alpha plane */
 918             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 919                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 920                     dest[aidx]= A;
 921                     dest[0]= R>>22;
 922                     dest[1]= G>>22;
 923                     dest[2]= B>>22;
 924                     dest+= step;
 925                 }
 926             } else {
 927                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 928                     dest[aidx]= 255;
 929                     dest[0]= R>>22;
 930                     dest[1]= G>>22;
 931                     dest[2]= B>>22;
 932                     dest+= step;
 933                 }
 934             }
 935         }
 936         break;
 937     case PIX_FMT_ABGR:
 938         dest++;
 939         aidx= 0;
             /* fall through: the decrement below turns aidx into -1 */
 940     case PIX_FMT_BGR24:
 941         aidx--;
             /* fall through */
 942     case PIX_FMT_BGRA:
             /* same structure as the RGB branch, with R and B stores swapped */
 943         if (CONFIG_SMALL) {
 944             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 945             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 946                 dest[aidx]= needAlpha ? A : 255;
 947                 dest[0]= B>>22;
 948                 dest[1]= G>>22;
 949                 dest[2]= R>>22;
 950                 dest+= step;
 951             }
 952         } else {
 953             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 954                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 955                     dest[aidx]= A;
 956                     dest[0]= B>>22;
 957                     dest[1]= G>>22;
 958                     dest[2]= R>>22;
 959                     dest+= step;
 960                 }
 961             } else {
 962                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 963                     dest[aidx]= 255;
 964                     dest[0]= B>>22;
 965                     dest[1]= G>>22;
 966                     dest[2]= R>>22;
 967                     dest+= step;
 968                 }
 969             }
 970         }
 971         break;
 972     default:
 973         assert(0);
 974     }
 975 }
 976
 977 static void fillPlane(uint8_t* plane, int stride, int width, int height, int y, uint8_t val)
 978 {
 979     int i;
 980     uint8_t *ptr = plane + stride*y;
 981     for (i=0; i<height; i++) {
 982         memset(ptr, val, width);
 983         ptr += stride;
 984     }
 985 }
 986
 987 static void rgb48ToY_c(uint8_t *dst, const uint8_t *src, int width,
 988                        uint32_t *unused)
 989 {
 990     int i;
 991     for (i = 0; i < width; i++) {
 992         int r = src[i*6+0];
 993         int g = src[i*6+2];
 994         int b = src[i*6+4];
 995
 996         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
 997     }
 998 }
 999
1000 static void rgb48ToUV_c(uint8_t *dstU, uint8_t *dstV,
1001                         const uint8_t *src1, const uint8_t *src2,
1002                         int width, uint32_t *unused)
1003 {
1004     int i;
1005     assert(src1==src2);
1006     for (i = 0; i < width; i++) {
1007         int r = src1[6*i + 0];
1008         int g = src1[6*i + 2];
1009         int b = src1[6*i + 4];
1010
1011         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1012         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1013     }
1014 }
1015
1016 static void rgb48ToUV_half_c(uint8_t *dstU, uint8_t *dstV,
1017                              const uint8_t *src1, const uint8_t *src2,
1018                              int width, uint32_t *unused)
1019 {
1020     int i;
1021     assert(src1==src2);
1022     for (i = 0; i < width; i++) {
1023         int r= src1[12*i + 0] + src1[12*i + 6];
1024         int g= src1[12*i + 2] + src1[12*i + 8];
1025         int b= src1[12*i + 4] + src1[12*i + 10];
1026
1027         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1028         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1029     }
1030 }
1031
1032 static void bgr48ToY_c(uint8_t *dst, const uint8_t *src, int width,
1033                        uint32_t *unused)
1034 {
1035     int i;
1036     for (i = 0; i < width; i++) {
1037         int b = src[i*6+0];
1038         int g = src[i*6+2];
1039         int r = src[i*6+4];
1040
1041         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1042     }
1043 }
1044
1045 static void bgr48ToUV_c(uint8_t *dstU, uint8_t *dstV,
1046                         const uint8_t *src1, const uint8_t *src2,
1047                         int width, uint32_t *unused)
1048 {
1049     int i;
1050     for (i = 0; i < width; i++) {
1051         int b = src1[6*i + 0];
1052         int g = src1[6*i + 2];
1053         int r = src1[6*i + 4];
1054
1055         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1056         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1057     }
1058 }
1059
1060 static void bgr48ToUV_half_c(uint8_t *dstU, uint8_t *dstV,
1061                              const uint8_t *src1, const uint8_t *src2,
1062                              int width, uint32_t *unused)
1063 {
1064     int i;
1065     for (i = 0; i < width; i++) {
1066         int b= src1[12*i + 0] + src1[12*i + 6];
1067         int g= src1[12*i + 2] + src1[12*i + 8];
1068         int r= src1[12*i + 4] + src1[12*i + 10];
1069
1070         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1071         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1072     }
1073 }
1074
     /**
      * BGR2Y generates a function name##_c that converts a line of packed
      * RGB pixels of integer type `type` to 8-bit luma.
      *
      * Each component is obtained as (pixel >> shX) & maskX. The masked value
      * is NOT shifted down to bit 0; instead the coefficient arguments
      * (RY, GY, BY) are pre-shifted by the caller so that all three products
      * end up on the same fixed-point scale, and S is the matching final
      * shift. 33<<(S-1) is 16.5 on that scale: +16 luma offset plus rounding.
      *
      * Note that the macro parameters RY/GY/BY have the same names as the
      * file-level coefficient macros; the instantiations below pass
      * expressions built from those.
      */
1075 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1076 static void name ## _c(uint8_t *dst, const uint8_t *src, \
1077                        int width, uint32_t *unused)\
1078 {\
1079     int i;\
1080     for (i=0; i<width; i++) {\
1081         int b= (((const type*)src)[i]>>shb)&maskb;\
1082         int g= (((const type*)src)[i]>>shg)&maskg;\
1083         int r= (((const type*)src)[i]>>shr)&maskr;\
1084 \
1085         dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1086     }\
1087 }
1088
     /* 32-bit variants read uint32_t pixels, 15/16-bit variants uint16_t.
      * The 15-bit variants use S = RGB2YUV_SHIFT+7, all others +8. */
1089 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1090 BGR2Y(uint32_t,bgr321ToY,16,16, 0, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1091 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1092 BGR2Y(uint32_t,rgb321ToY, 0,16,16, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1093 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1094 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1095 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1096 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1097
/**
 * Extract the alpha channel of a line of 4-byte pixels: the first byte of
 * every pixel is copied to dst.
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t alpha = src[4*x];
        dst[x] = alpha;
    }
}
1105
     /**
      * BGR2UV generates two functions for packed RGB pixels of integer type
      * `type`; both read only src (the second source pointer is unused):
      *
      *   name##_c      - one U/V pair per input pixel. The pixel is first
      *                   shifted right by shp, then each component is masked
      *                   with maskX and shifted right by shX.
      *   name##_half_c - one U/V pair per two input pixels. g collects every
      *                   bit outside the r and b masks of both pixels;
      *                   subtracting it from pix0+pix1 leaves the summed r and
      *                   b fields. Each sum can carry into one extra bit,
      *                   hence the mask|(2*mask) patterns. The bias term and
      *                   final shift are doubled / one bit larger to account
      *                   for the two-pixel sum.
      *
      * As in BGR2Y, the coefficient arguments are pre-shifted by the caller to
      * bring all components onto a common scale, S is the final shift, and
      * 257<<(S-1) is 128.5 on that scale (+128 chroma bias plus rounding).
      *
      * NOTE(review): pix0/pix1/g are plain int. For the uint32_t
      * instantiations with shp == 0 the top byte of the pixel is still
      * present, so pix0+pix1 may exceed the int range — confirm whether
      * unsigned arithmetic should be used here.
      */
1106 #define BGR2UV(type, name, shr, shg, shb, shp, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S) \
1107 static void name ## _c(uint8_t *dstU, uint8_t *dstV, \
1108                        const uint8_t *src, const uint8_t *dummy, \
1109                        int width, uint32_t *unused)\
1110 {\
1111     int i;\
1112     for (i=0; i<width; i++) {\
1113         int b= ((((const type*)src)[i]>>shp)&maskb)>>shb;\
1114         int g= ((((const type*)src)[i]>>shp)&maskg)>>shg;\
1115         int r= ((((const type*)src)[i]>>shp)&maskr)>>shr;\
1116 \
1117         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1118         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1119     }\
1120 }\
1121 static void name ## _half_c(uint8_t *dstU, uint8_t *dstV, \
1122                             const uint8_t *src, const uint8_t *dummy, \
1123                             int width, uint32_t *unused)\
1124 {\
1125     int i;\
1126     for (i=0; i<width; i++) {\
1127         int pix0= ((const type*)src)[2*i+0]>>shp;\
1128         int pix1= ((const type*)src)[2*i+1]>>shp;\
1129         int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
1130         int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1131         int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1132         g&= maskg|(2*maskg);\
1133 \
1134         g>>=shg;\
1135 \
1136         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1137         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1138     }\
1139 }
1140
     /* The *321 variants differ from their *32 counterparts only in shp = 8,
      * i.e. the pixel is shifted down by one byte before the masks apply. */
1141 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1142 BGR2UV(uint32_t,bgr321ToUV,16, 0, 0, 8, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1143 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1144 BGR2UV(uint32_t,rgb321ToUV, 0, 0,16, 8,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1145 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1146 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1147 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1148 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1149
/**
 * Convert a line of 8-bit palette indices to luma: each output byte is the
 * low 8 bits of the palette entry selected by the corresponding source byte.
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];
        dst[x] = pal[index] & 0xFF;
    }
}
1159
/**
 * Convert a line of 8-bit palette indices to chroma: U is bits 8..15 and V
 * is bits 16..23 of the palette entry selected by each source byte.
 * Only src1 is read (src2 must be the same pointer).
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x;

    assert(src1 == src2);
    for (x = 0; x < width; x++) {
        const int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
    }
}
1173
/**
 * Expand a line of 1 bit-per-pixel data to 8-bit luma, with a set bit
 * meaning black: every source byte is inverted, then each bit (most
 * significant bit first) becomes 0 or 255.
 *
 * All width pixels are written. The previous implementation only converted
 * width/8 whole bytes, leaving the last width%8 output pixels untouched when
 * the width was not a multiple of 8; the partial trailing byte is now
 * converted as well.
 *
 * @param dst    output, at least width bytes
 * @param src    packed input, at least (width+7)/8 bytes
 * @param width  number of pixels; nothing is written if it is <= 0
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int bits = ~src[x >> 3];      /* inverted: 1 now means white */

        dst[x] = ((bits >> (7 - (x & 7))) & 1) * 255;
    }
}
1184
/**
 * Expand a line of 1 bit-per-pixel data to 8-bit luma, with a set bit
 * meaning white: each bit of every source byte (most significant bit first)
 * becomes 0 or 255.
 *
 * All width pixels are written. The previous implementation only converted
 * width/8 whole bytes, leaving the last width%8 output pixels untouched when
 * the width was not a multiple of 8; the partial trailing byte is now
 * converted as well.
 *
 * @param dst    output, at least width bytes
 * @param src    packed input, at least (width+7)/8 bytes
 * @param width  number of pixels; nothing is written if it is <= 0
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int bits = src[x >> 3];

        dst[x] = ((bits >> (7 - (x & 7))) & 1) * 255;
    }
}
1195
1196 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
1197                        const int16_t *chrUSrc, const int16_t *chrVSrc,
1198                        const int16_t *alpSrc,
1199                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
1200                        uint8_t *aDest, int dstW, int chrDstW)
1201 {
1202     int i;
1203     for (i=0; i<dstW; i++) {
1204         int val= (lumSrc[i]+64)>>7;
1205         dest[i]= av_clip_uint8(val);
1206     }
1207
1208     if (uDest)
1209         for (i=0; i<chrDstW; i++) {
1210             int u=(chrUSrc[i]+64)>>7;
1211             int v=(chrVSrc[i]+64)>>7;
1212             uDest[i]= av_clip_uint8(u);
1213             vDest[i]= av_clip_uint8(v);
1214         }
1215
1216     if (CONFIG_SWSCALE_ALPHA && aDest)
1217         for (i=0; i<dstW; i++) {
1218             int val= (alpSrc[i]+64)>>7;
1219             aDest[i]= av_clip_uint8(val);
1220         }
1221 }
1222
1223 /**
1224  * vertical bilinear scale YV12 to RGB
1225  */
     /*
      * Takes two input lines per plane (buf0/buf1, ubuf0/ubuf1, vbuf0/vbuf1,
      * abuf0/abuf1) together with the blend weights yalpha and uvalpha.
      * yalpha1/uvalpha1 are the complementary weights (4095 - alpha).
      *
      * The body is generated by YSCALE_YUV_2_ANYRGB_C, instantiated with the
      * two-line ("2") variants of the helper macros; those macros, defined
      * earlier in the file, reference the parameters and the locals declared
      * here by name.
      */
1226 static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
1227                           const uint16_t *buf1, const uint16_t *ubuf0,
1228                           const uint16_t *ubuf1, const uint16_t *vbuf0,
1229                           const uint16_t *vbuf1, const uint16_t *abuf0,
1230                           const uint16_t *abuf1, uint8_t *dest, int dstW,
1231                           int yalpha, int uvalpha, int y)
1232 {
1233     int  yalpha1=4095- yalpha;
1234     int uvalpha1=4095-uvalpha;
1235     int i;
1236
1237     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1238 }
1239
1240 /**
1241  * YV12 to RGB without scaling or interpolating
1242  */
     /*
      * Single luma line variant. uvalpha selects between two macro sets:
      * below 2048 the "1" variants are used, otherwise the "1B" variants
      * (presumably the latter blend ubuf0/ubuf1 and vbuf0/vbuf1 — confirm in
      * the macro definitions).
      *
      * buf1, yalpha and yalpha1 are not real inputs here; they are set up
      * (buf1 = buf0, yalpha = 4096, yalpha1 = 0) because the mono macro
      * YSCALE_YUV_2_MONO2_C, shared with the two-line function, is used for
      * the 1 bpp formats — see the FIXME notes below.
      */
1243 static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
1244                           const uint16_t *ubuf0, const uint16_t *ubuf1,
1245                           const uint16_t *vbuf0, const uint16_t *vbuf1,
1246                           const uint16_t *abuf0, uint8_t *dest, int dstW,
1247                           int uvalpha, enum PixelFormat dstFormat,
1248                           int flags, int y)
1249 {
1250     const int yalpha1=0;
1251     int i;
1252
1253     const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1254     const int yalpha= 4096; //FIXME ...
1255
1256     if (uvalpha < 2048) {
1257         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1258     } else {
1259         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1260     }
1261 }
1262
1263 //FIXME yuy2* can read up to 7 samples too many
1264
/**
 * Extract luma from a line of packed YUYV data: every even-indexed source
 * byte (offsets 0, 2, 4, ...) is copied to dst.
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t luma = src[2*x];
        dst[x] = luma;
    }
}
1272
/**
 * Extract chroma from a line of packed YUYV data: for every 4-byte group,
 * byte 1 goes to dstU and byte 3 to dstV. Only src1 is read; after the copy
 * the function asserts that src2 is the same pointer.
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int group = 4*x;

        dstU[x] = src1[group + 1];
        dstV[x] = src1[group + 3];
    }
    assert(src1 == src2);
}
1283
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the second byte
 * of every sample (offset 2*i+1): src1 feeds dstU, src2 feeds dstV.
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int hi = 2*x + 1;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
    }
}
1293
/* This is almost identical to the YUY2 reader above, and exists only because
 * calling yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract luma from a line of packed UYVY data: every odd-indexed source
 * byte (offsets 1, 3, 5, ...) is copied to dst.
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t luma = src[2*x + 1];
        dst[x] = luma;
    }
}
1303
/**
 * Extract chroma from a line of packed UYVY data: for every 4-byte group,
 * byte 0 goes to dstU and byte 2 to dstV. Only src1 is read; after the copy
 * the function asserts that src2 is the same pointer.
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int group = 4*x;

        dstU[x] = src1[group + 0];
        dstV[x] = src1[group + 2];
    }
    assert(src1 == src2);
}
1314
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the first byte
 * of every sample (offset 2*i): src1 feeds dstU, src2 feeds dstV.
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int hi = 2*x;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
    }
}
1324
1325 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1326                                         const uint8_t *src, int width)
1327 {
1328     int i;
1329     for (i = 0; i < width; i++) {
1330         dst1[i] = src[2*i+0];
1331         dst2[i] = src[2*i+1];
1332     }
1333 }
1334
/**
 * Chroma reader for interleaved data where U comes first in every byte
 * pair: first byte to dstU, second to dstV. src2 and unused are ignored.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    (void)src2;
    (void)unused;

    nvXXtoUV_c(dstU, dstV, src1, width);
}
1341
/**
 * Chroma reader for interleaved data where V comes first in every byte
 * pair: first byte to dstV, second to dstU. src2 and unused are ignored.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    (void)src2;
    (void)unused;

    /* destinations swapped relative to the U-first variant */
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1348
     /**
      * YUV_NBPS(depth, endianness, rfunc) generates two readers for planar
      * data stored as 16-bit words:
      *   <endianness><depth>ToUV_c - chroma, reading separate U and V planes
      *   <endianness><depth>ToY_c  - luma
      * Each sample is loaded with rfunc (AV_RL16 or AV_RB16 in the
      * instantiations below) and shifted right by depth-8, i.e. the low
      * bits are simply dropped to obtain an 8-bit value.
      */
1349 // FIXME Maybe dither instead.
1350 #define YUV_NBPS(depth, endianness, rfunc) \
1351 static void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1352                                           const uint8_t *_srcU, const uint8_t *_srcV, \
1353                                           int width, uint32_t *unused) \
1354 { \
1355     int i; \
1356     const uint16_t *srcU = (const uint16_t*)_srcU; \
1357     const uint16_t *srcV = (const uint16_t*)_srcV; \
1358     for (i = 0; i < width; i++) { \
1359         dstU[i] = rfunc(&srcU[i])>>(depth-8); \
1360         dstV[i] = rfunc(&srcV[i])>>(depth-8); \
1361     } \
1362 } \
1363 \
1364 static void endianness ## depth ## ToY_c(uint8_t *dstY, const uint8_t *_srcY, \
1365                                          int width, uint32_t *unused) \
1366 { \
1367     int i; \
1368     const uint16_t *srcY = (const uint16_t*)_srcY; \
1369     for (i = 0; i < width; i++) \
1370         dstY[i] = rfunc(&srcY[i])>>(depth-8); \
1371 } \
1372
     /* Instantiated for 9- and 10-bit samples in both byte orders. */
1373 YUV_NBPS( 9, LE, AV_RL16)
1374 YUV_NBPS( 9, BE, AV_RB16)
1375 YUV_NBPS(10, LE, AV_RL16)
1376 YUV_NBPS(10, BE, AV_RB16)
1377
1378 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1379                        int width, uint32_t *unused)
1380 {
1381     int i;
1382     for (i=0; i<width; i++) {
1383         int b= src[i*3+0];
1384         int g= src[i*3+1];
1385         int r= src[i*3+2];
1386
1387         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1388     }
1389 }
1390
1391 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1392                         const uint8_t *src2, int width, uint32_t *unused)
1393 {
1394     int i;
1395     for (i=0; i<width; i++) {
1396         int b= src1[3*i + 0];
1397         int g= src1[3*i + 1];
1398         int r= src1[3*i + 2];
1399
1400         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1401         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1402     }
1403     assert(src1 == src2);
1404 }
1405
1406 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1407                              const uint8_t *src2, int width, uint32_t *unused)
1408 {
1409     int i;
1410     for (i=0; i<width; i++) {
1411         int b= src1[6*i + 0] + src1[6*i + 3];
1412         int g= src1[6*i + 1] + src1[6*i + 4];
1413         int r= src1[6*i + 2] + src1[6*i + 5];
1414
1415         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1416         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1417     }
1418     assert(src1 == src2);
1419 }
1420
1421 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1422                        uint32_t *unused)
1423 {
1424     int i;
1425     for (i=0; i<width; i++) {
1426         int r= src[i*3+0];
1427         int g= src[i*3+1];
1428         int b= src[i*3+2];
1429
1430         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1431     }
1432 }
1433
1434 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1435                         const uint8_t *src2, int width, uint32_t *unused)
1436 {
1437     int i;
1438     assert(src1==src2);
1439     for (i=0; i<width; i++) {
1440         int r= src1[3*i + 0];
1441         int g= src1[3*i + 1];
1442         int b= src1[3*i + 2];
1443
1444         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1445         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1446     }
1447 }
1448
1449 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1450                              const uint8_t *src2, int width, uint32_t *unused)
1451 {
1452     int i;
1453     assert(src1==src2);
1454     for (i=0; i<width; i++) {
1455         int r= src1[6*i + 0] + src1[6*i + 3];
1456         int g= src1[6*i + 1] + src1[6*i + 4];
1457         int b= src1[6*i + 2] + src1[6*i + 5];
1458
1459         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1460         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1461     }
1462 }
1463
1464
// bilinear / bicubic scaling
/**
 * Generic horizontal FIR scaler.
 *
 * For every output sample i, filterSize source bytes starting at
 * filterPos[i] are multiplied by the coefficients filter[filterSize*i ...]
 * and summed. The sum is shifted right by 7 and limited from above to
 * 32767, because (per the original note) the cubic filter can overflow.
 * There is no lower clamp. srcW and xInc are not used.
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     int srcW, int xInc,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const int firstSrc   = filterPos[x];
        const int firstCoeff = filterSize*x;
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)src[firstSrc + tap])*filter[firstCoeff + tap];

        dst[x] = FFMIN(acc>>7, (1<<15)-1); // the cubic equation does overflow ...
    }
}
1484
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place range conversion of one line of intermediate chroma samples.
 *
 * Each sample is clamped to at most 30775, multiplied by 4663, offset by
 * -9289992 and shifted right by 12 (9289992 = 2268*4096 + 264, which is
 * what the trailing "-264" notes refer to). Judging by the name this
 * expands limited-range chroma to full (JPEG) range — confirm with callers.
 */
static void chrRangeToJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        dstU[x] = (FFMIN(dstU[x],30775)*4663 - 9289992)>>12; //-264
        dstV[x] = (FFMIN(dstV[x],30775)*4663 - 9289992)>>12; //-264
    }
}
/**
 * In-place range conversion of one line of intermediate chroma samples:
 * each sample becomes (v*1799 + 4081085) >> 11.
 *
 * 4081085 = 1992*2048 + 1469, which is what the trailing "1469" notes refer
 * to. Judging by the name this compresses full (JPEG) range chroma to
 * limited range — confirm with callers.
 */
static void chrRangeFromJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        dstU[x] = (dstU[x]*1799 + 4081085)>>11; //1469
        dstV[x] = (dstV[x]*1799 + 4081085)>>11; //1469
    }
}
/**
 * In-place range conversion of one line of intermediate luma samples: each
 * sample is clamped to at most 30189, multiplied by 19077, offset by
 * -39057361 and shifted right by 14. Judging by the name this expands
 * limited-range luma to full (JPEG) range — confirm with callers.
 */
static void lumRangeToJpeg_c(uint16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = (FFMIN(dst[x],30189)*19077 - 39057361)>>14;
    }
}
/**
 * In-place range conversion of one line of intermediate luma samples: each
 * sample becomes (v*14071 + 33561947) >> 14. Judging by the name this
 * compresses full (JPEG) range luma to limited range — confirm with callers.
 */
static void lumRangeFromJpeg_c(uint16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = (dst[x]*14071 + 33561947)>>14;
    }
}
1515
1516 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1517                            const uint8_t *src, int srcW, int xInc)
1518 {
1519     int i;
1520     unsigned int xpos=0;
1521     for (i=0;i<dstWidth;i++) {
1522         register unsigned int xx=xpos>>16;
1523         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1524         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1525         xpos+=xInc;
1526     }
1527 }
1528
1529 // *** horizontal scale Y line to temp buffer
1530 static inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1531                            const uint8_t *src, int srcW, int xInc,
1532                            const int16_t *hLumFilter,
1533                            const int16_t *hLumFilterPos, int hLumFilterSize,
1534                            uint8_t *formatConvBuffer,
1535                            uint32_t *pal, int isAlpha)
1536 {
1537     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1538     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1539
1540     src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
1541
1542     if (toYV12) {
1543         toYV12(formatConvBuffer, src, srcW, pal);
1544         src= formatConvBuffer;
1545     }
1546
1547     if (!c->hyscale_fast) {
1548         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1549     } else { // fast bilinear upscale / crap downscale
1550         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1551     }
1552
1553     if (convertRange)
1554         convertRange(dst, dstWidth);
1555 }
1556
1557 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1558                            int dstWidth, const uint8_t *src1,
1559                            const uint8_t *src2, int srcW, int xInc)
1560 {
1561     int i;
1562     unsigned int xpos=0;
1563     for (i=0;i<dstWidth;i++) {
1564         register unsigned int xx=xpos>>16;
1565         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1566         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1567         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1568         xpos+=xInc;
1569     }
1570 }
1571
1572 static inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1573                            const uint8_t *src1, const uint8_t *src2,
1574                            int srcW, int xInc, const int16_t *hChrFilter,
1575                            const int16_t *hChrFilterPos, int hChrFilterSize,
1576                            uint8_t *formatConvBuffer, uint32_t *pal)
1577 {
1578
1579     src1 += c->chrSrcOffset;
1580     src2 += c->chrSrcOffset;
1581
1582     if (c->chrToYV12) {
1583         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1584         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1585         src1= formatConvBuffer;
1586         src2= buf2;
1587     }
1588
1589     if (!c->hcscale_fast) {
1590         c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1591         c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1592     } else { // fast bilinear upscale / crap downscale
1593         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1594     }
1595
1596     if (c->chrConvertRange)
1597         c->chrConvertRange(dst1, dst2, dstWidth);
1598 }
1599
     /* Compile-time switch for buffer tracing: when non-zero, DEBUG_BUFFERS()
      * forwards its arguments to av_log() at AV_LOG_DEBUG level. */
1600 #define DEBUG_SWSCALE_BUFFERS 0
     /* Uses a variable named c from the calling scope as the log context.
      * NOTE(review): this expands to a bare `if`, so it would capture a
      * following `else`; use it only as a stand-alone statement. */
1601 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
1602
/**
 * Scale one horizontal slice of the source picture into the destination.
 *
 * Source lines are horizontally scaled (hyscale / hcscale) into the ring
 * buffers lumPixBuf, chrUPixBuf, chrVPixBuf and, when present, alpPixBuf.
 * Each destination line is then produced by a vertical filter / output
 * function once every source line it needs has been buffered. If the slice
 * ends before that, the remaining lines of the slice are buffered and the
 * loop stops; the position is saved in the context so the next call resumes.
 *
 * @param c          context; dstY, lumBufIndex, chrBufIndex, lastInLumBuf and
 *                   lastInChrBuf are read at entry and written back at exit
 * @param src        source plane pointers; for packed source formats all four
 *                   entries are overwritten with src[0]
 * @param srcStride  source strides; modified in place (see below)
 * @param srcSliceY  first source line of this slice; 0 resets the state
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines completed by this call
 */
1603 static int swScale(SwsContext *c, const uint8_t* src[],
1604                    int srcStride[], int srcSliceY,
1605                    int srcSliceH, uint8_t* dst[], int dstStride[])
1606 {
1607     /* load a few things into local vars to make the code more readable? and faster */
1608     const int srcW= c->srcW;
1609     const int dstW= c->dstW;
1610     const int dstH= c->dstH;
1611     const int chrDstW= c->chrDstW;
1612     const int chrSrcW= c->chrSrcW;
1613     const int lumXInc= c->lumXInc;
1614     const int chrXInc= c->chrXInc;
1615     const enum PixelFormat dstFormat= c->dstFormat;
1616     const int flags= c->flags;
1617     int16_t *vLumFilterPos= c->vLumFilterPos;
1618     int16_t *vChrFilterPos= c->vChrFilterPos;
1619     int16_t *hLumFilterPos= c->hLumFilterPos;
1620     int16_t *hChrFilterPos= c->hChrFilterPos;
1621     int16_t *vLumFilter= c->vLumFilter;
1622     int16_t *vChrFilter= c->vChrFilter;
1623     int16_t *hLumFilter= c->hLumFilter;
1624     int16_t *hChrFilter= c->hChrFilter;
1625     int32_t *lumMmxFilter= c->lumMmxFilter;
1626     int32_t *chrMmxFilter= c->chrMmxFilter;
1627     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
1628     const int vLumFilterSize= c->vLumFilterSize;
1629     const int vChrFilterSize= c->vChrFilterSize;
1630     const int hLumFilterSize= c->hLumFilterSize;
1631     const int hChrFilterSize= c->hChrFilterSize;
1632     int16_t **lumPixBuf= c->lumPixBuf;
1633     int16_t **chrUPixBuf= c->chrUPixBuf;
1634     int16_t **chrVPixBuf= c->chrVPixBuf;
1635     int16_t **alpPixBuf= c->alpPixBuf;
1636     const int vLumBufSize= c->vLumBufSize;
1637     const int vChrBufSize= c->vChrBufSize;
1638     uint8_t *formatConvBuffer= c->formatConvBuffer;
     /* slice position and height in chroma lines; the height is rounded up */
1639     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
1640     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
1641     int lastDstY;
1642     uint32_t *pal=c->pal_yuv;
1643
1644     /* vars which will change and which we need to store back in the context */
1645     int dstY= c->dstY;
1646     int lumBufIndex= c->lumBufIndex;
1647     int chrBufIndex= c->chrBufIndex;
1648     int lastInLumBuf= c->lastInLumBuf;
1649     int lastInChrBuf= c->lastInChrBuf;
1650
     /* packed source: make all four plane pointers and strides refer to plane 0
        (this writes into the caller-supplied arrays) */
1651     if (isPacked(c->srcFormat)) {
1652         src[0]=
1653         src[1]=
1654         src[2]=
1655         src[3]= src[0];
1656         srcStride[0]=
1657         srcStride[1]=
1658         srcStride[2]=
1659         srcStride[3]= srcStride[0];
1660     }
     /* multiply the chroma strides by 2^vChrDrop (also modifies the caller's array) */
1661     srcStride[1]<<= c->vChrDrop;
1662     srcStride[2]<<= c->vChrDrop;
1663
1664     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
1665                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
1666                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
1667     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
1668                    srcSliceY,    srcSliceH,    dstY,    dstH);
1669     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
1670                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
1671
     /* warn (at most once, guarded by a function-static flag) when any destination
        stride is not a multiple of 8 and SWS_PRINT_INFO is set */
1672     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
1673         static int warnedAlready=0; //FIXME move this into the context perhaps
1674         if (flags & SWS_PRINT_INFO && !warnedAlready) {
1675             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
1676                    "         ->cannot do aligned memory accesses anymore\n");
1677             warnedAlready=1;
1678         }
1679     }
1680
1681     /* Note the user might start scaling the picture in the middle so this
1682        will not get executed. This is not really intended but works
1683        currently, so people might do it. */
1684     if (srcSliceY ==0) {
1685         lumBufIndex=-1;
1686         chrBufIndex=-1;
1687         dstY=0;
1688         lastInLumBuf= -1;
1689         lastInChrBuf= -1;
1690     }
1691
     /* remember the first line of this call; used for the return value and the alpha fill */
1692     lastDstY= dstY;
1693
1694     for (;dstY < dstH; dstY++) {
1695         unsigned char *dest =dst[0]+dstStride[0]*dstY;
1696         const int chrDstY= dstY>>c->chrDstVSubSample;
1697         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
1698         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
1699         unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
1700
1701         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
         /* same, but for the last luma line sharing this chroma line (clamped to dstH-1) */
1702         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
1703         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
1704         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
1705         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
1706         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
1707         int enough_lines;
1708
1709         //handle holes (FAST_BILINEAR & weird filters)
1710         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
1711         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
1712         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
1713         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
1714
1715         DEBUG_BUFFERS("dstY: %d\n", dstY);
1716         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
1717                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
1718         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
1719                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
1720
1721         // Do we have enough lines in this slice to output the dstY line
1722         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
1723
         /* not enough input: buffer everything up to the end of this slice instead */
1724         if (!enough_lines) {
1725             lastLumSrcY = srcSliceY + srcSliceH - 1;
1726             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
1727             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
1728                                             lastLumSrcY, lastChrSrcY);
1729         }
1730
1731         //Do horizontal scaling
1732         while(lastInLumBuf < lastLumSrcY) {
1733             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
1734             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
1735             lumBufIndex++;
1736             assert(lumBufIndex < 2*vLumBufSize);
1737             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
1738             assert(lastInLumBuf + 1 - srcSliceY >= 0);
1739             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
1740                     hLumFilter, hLumFilterPos, hLumFilterSize,
1741                     formatConvBuffer,
1742                     pal, 0);
1743             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
1744                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
1745                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
1746                         formatConvBuffer,
1747                         pal, 1);
1748             lastInLumBuf++;
1749             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
1750                                lumBufIndex,    lastInLumBuf);
1751         }
1752         while(lastInChrBuf < lastChrSrcY) {
1753             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
1754             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
1755             chrBufIndex++;
1756             assert(chrBufIndex < 2*vChrBufSize);
1757             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
1758             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
1759             //FIXME replace parameters through context struct (some at least)
1760
             /* the buffer index still advances when chroma scaling is skipped */
1761             if (c->needs_hcscale)
1762                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
1763                           chrDstW, src1, src2, chrSrcW, chrXInc,
1764                           hChrFilter, hChrFilterPos, hChrFilterSize,
1765                           formatConvBuffer, pal);
1766             lastInChrBuf++;
1767             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
1768                                chrBufIndex,    lastInChrBuf);
1769         }
1770         //wrap buf index around to stay inside the ring buffer
1771         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
1772         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
1773         if (!enough_lines)
1774             break; //we can't output a dstY line so let's try with the next slice
1775
1776 #if HAVE_MMX
1777         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
1778 #endif
         /* All but the last two destination lines go through the function pointers
            stored in the context; the last two lines (else branch below) call the
            plain C functions directly. */
1779         if (dstY < dstH-2) {
1780             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1781             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1782             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1783             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1784             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1785                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1786                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
1787                 c->yuv2nv12X(c,
1788                              vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1789                              vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1790                              dest, uDest, dstW, chrDstW, dstFormat);
1791             } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
1792                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1793                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1794                 if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
1795                     yuv2yuvX16_c(vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1796                                  vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
1797                                  chrVSrcPtr, vChrFilterSize,
1798                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest,
1799                                  (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
1800                                  dstFormat);
1801                 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
1802                     const int16_t *lumBuf = lumSrcPtr[0];
1803                     const int16_t *chrUBuf= chrUSrcPtr[0];
1804                     const int16_t *chrVBuf= chrVSrcPtr[0];
1805                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
1806                     c->yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
1807                                 uDest, vDest, aDest, dstW, chrDstW);
1808                 } else { //General YV12
1809                     c->yuv2yuvX(c,
1810                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1811                                 vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
1812                                 chrVSrcPtr, vChrFilterSize,
1813                                 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
1814                 }
1815             } else {
1816                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
1817                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1818                 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
1819                     int chrAlpha= vChrFilter[2*dstY+1];
1820                     if(flags & SWS_FULL_CHR_H_INT) {
1821                         yuv2rgbX_c_full(c, //FIXME write a packed1_full function
1822                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1823                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr,
1824                                         chrVSrcPtr, vChrFilterSize,
1825                                         alpSrcPtr, dest, dstW, dstY);
1826                     } else {
1827                         c->yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
1828                                        *chrVSrcPtr, *(chrVSrcPtr+1),
1829                                        alpPixBuf ? *alpSrcPtr : NULL,
1830                                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
1831                     }
1832                 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
1833                     int lumAlpha= vLumFilter[2*dstY+1];
1834                     int chrAlpha= vChrFilter[2*dstY+1];
1835                     lumMmxFilter[2]=
1836                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
1837                     chrMmxFilter[2]=
1838                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
1839                     if(flags & SWS_FULL_CHR_H_INT) {
1840                         yuv2rgbX_c_full(c, //FIXME write a packed2_full function
1841                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1842                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1843                                         alpSrcPtr, dest, dstW, dstY);
1844                     } else {
1845                         c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
1846                                        *chrVSrcPtr, *(chrVSrcPtr+1),
1847                                        alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
1848                                        dest, dstW, lumAlpha, chrAlpha, dstY);
1849                     }
1850                 } else { //general RGB
1851                     if(flags & SWS_FULL_CHR_H_INT) {
1852                         yuv2rgbX_c_full(c,
1853                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1854                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1855                                         alpSrcPtr, dest, dstW, dstY);
1856                     } else {
1857                         c->yuv2packedX(c,
1858                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1859                                        vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1860                                        alpSrcPtr, dest, dstW, dstY);
1861                     }
1862                 }
1863             }
1864         } else { // hmm looks like we can't use MMX here without overwriting this array's tail
1865             const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1866             const int16_t **chrUSrcPtr= (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1867             const int16_t **chrVSrcPtr= (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1868             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1869             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1870                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1871                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
1872                 yuv2nv12X_c(c, vLumFilter+dstY*vLumFilterSize,
1873                             lumSrcPtr, vLumFilterSize,
1874                             vChrFilter+chrDstY*vChrFilterSize,
1875                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1876                             dest, uDest, dstW, chrDstW, dstFormat);
1877             } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
1878                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1879                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1880                 if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
1881                     yuv2yuvX16_c(vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1882                                  vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1883                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
1884                                  dstFormat);
1885                 } else {
1886                     yuv2yuvX_c(c, vLumFilter+dstY*vLumFilterSize,
1887                                lumSrcPtr, vLumFilterSize,
1888                                vChrFilter+chrDstY*vChrFilterSize,
1889                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1890                                alpSrcPtr, dest, uDest, vDest, aDest,
1891                                dstW, chrDstW);
1892                 }
1893             } else {
1894                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
1895                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1896                 if(flags & SWS_FULL_CHR_H_INT) {
1897                     yuv2rgbX_c_full(c,
1898                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1899                                     vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1900                                     alpSrcPtr, dest, dstW, dstY);
1901                 } else {
1902                     yuv2packedX_c(c,
1903                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1904                                   vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1905                                   alpSrcPtr, dest, dstW, dstY);
1906                 }
1907             }
1908         }
1909     }
1910
     /* YUVA420P output without a scaled alpha channel: fill plane 3 with 255 for the
        lines produced by this call (NOTE(review): fillPlane argument order is
        presumably plane, stride, width, height, first line, value - confirm) */
1911     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
1912         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
1913
1914 #if HAVE_MMX2
1915     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
1916         __asm__ volatile("sfence":::"memory");
1917 #endif
1918     emms_c();
1919
1920     /* store changed local vars back in the context */
1921     c->dstY= dstY;
1922     c->lumBufIndex= lumBufIndex;
1923     c->chrBufIndex= chrBufIndex;
1924     c->lastInLumBuf= lastInLumBuf;
1925     c->lastInChrBuf= lastInChrBuf;
1926
1927     return dstY - lastDstY;
1928 }
1929
1930 static void sws_init_swScale_c(SwsContext *c)
1931 {
1932     enum PixelFormat srcFormat = c->srcFormat;
1933
1934     c->yuv2nv12X    = yuv2nv12X_c;
1935     c->yuv2yuv1     = yuv2yuv1_c;
1936     c->yuv2yuvX     = yuv2yuvX_c;
1937     c->yuv2packed1  = yuv2packed1_c;
1938     c->yuv2packed2  = yuv2packed2_c;
1939     c->yuv2packedX  = yuv2packedX_c;
1940
1941     c->hScale       = hScale_c;
1942
1943     if (c->flags & SWS_FAST_BILINEAR)
1944     {
1945         c->hyscale_fast = hyscale_fast_c;
1946         c->hcscale_fast = hcscale_fast_c;
1947     }
1948
1949     c->chrToYV12 = NULL;
1950     switch(srcFormat) {
1951         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
1952         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
1953         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
1954         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
1955         case PIX_FMT_RGB8     :
1956         case PIX_FMT_BGR8     :
1957         case PIX_FMT_PAL8     :
1958         case PIX_FMT_BGR4_BYTE:
1959         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
1960         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
1961         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
1962         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
1963         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
1964         case PIX_FMT_YUV420P16BE:
1965         case PIX_FMT_YUV422P16BE:
1966         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
1967         case PIX_FMT_YUV420P16LE:
1968         case PIX_FMT_YUV422P16LE:
1969         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
1970     }
1971     if (c->chrSrcHSubSample) {
1972         switch(srcFormat) {
1973         case PIX_FMT_RGB48BE:
1974         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half_c; break;
1975         case PIX_FMT_BGR48BE:
1976         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half_c; break;
1977         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half_c;  break;
1978         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half_c; break;
1979         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_half_c; break;
1980         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half_c; break;
1981         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half_c; break;
1982         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half_c;  break;
1983         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half_c; break;
1984         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_half_c; break;
1985         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half_c; break;
1986         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half_c; break;
1987         }
1988     } else {
1989         switch(srcFormat) {
1990         case PIX_FMT_RGB48BE:
1991         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_c; break;
1992         case PIX_FMT_BGR48BE:
1993         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_c; break;
1994         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_c;  break;
1995         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_c; break;
1996         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_c; break;
1997         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_c; break;
1998         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_c; break;
1999         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_c;  break;
2000         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_c; break;
2001         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_c; break;
2002         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_c; break;
2003         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_c; break;
2004         }
2005     }
2006
2007     c->lumToYV12 = NULL;
2008     c->alpToYV12 = NULL;
2009     switch (srcFormat) {
2010     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
2011     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
2012     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2013     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2014     case PIX_FMT_YUYV422  :
2015     case PIX_FMT_YUV420P16BE:
2016     case PIX_FMT_YUV422P16BE:
2017     case PIX_FMT_YUV444P16BE:
2018     case PIX_FMT_Y400A    :
2019     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
2020     case PIX_FMT_UYVY422  :
2021     case PIX_FMT_YUV420P16LE:
2022     case PIX_FMT_YUV422P16LE:
2023     case PIX_FMT_YUV444P16LE:
2024     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c; break;
2025     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c; break;
2026     case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY_c; break;
2027     case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY_c; break;
2028     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c; break;
2029     case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY_c; break;
2030     case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY_c; break;
2031     case PIX_FMT_RGB8     :
2032     case PIX_FMT_BGR8     :
2033     case PIX_FMT_PAL8     :
2034     case PIX_FMT_BGR4_BYTE:
2035     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2036     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2037     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2038     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2039     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2040     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2041     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2042     case PIX_FMT_RGB48BE:
2043     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY_c; break;
2044     case PIX_FMT_BGR48BE:
2045     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY_c; break;
2046     }
2047     if (c->alpPixBuf) {
2048         switch (srcFormat) {
2049         case PIX_FMT_RGB32  :
2050         case PIX_FMT_RGB32_1:
2051         case PIX_FMT_BGR32  :
2052         case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA_c; break;
2053         case PIX_FMT_Y400A  : c->alpToYV12 = yuy2ToY_c; break;
2054         }
2055     }
2056
2057     switch (srcFormat) {
2058     case PIX_FMT_Y400A  :
2059         c->alpSrcOffset = 1;
2060         break;
2061     case PIX_FMT_RGB32  :
2062     case PIX_FMT_BGR32  :
2063         c->alpSrcOffset = 3;
2064         break;
2065     case PIX_FMT_RGB48LE:
2066     case PIX_FMT_BGR48LE:
2067         c->lumSrcOffset = 1;
2068         c->chrSrcOffset = 1;
2069         c->alpSrcOffset = 1;
2070         break;
2071     }
2072
2073     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2074         if (c->srcRange) {
2075             c->lumConvertRange = lumRangeFromJpeg_c;
2076             c->chrConvertRange = chrRangeFromJpeg_c;
2077         } else {
2078             c->lumConvertRange = lumRangeToJpeg_c;
2079             c->chrConvertRange = chrRangeToJpeg_c;
2080         }
2081     }
2082
2083     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2084           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2085         c->needs_hcscale = 1;
2086 }
2087
2088 SwsFunc ff_getSwsFunc(SwsContext *c)
2089 {
2090     sws_init_swScale_c(c);
2091
2092     if (HAVE_MMX)
2093         ff_sws_init_swScale_mmx(c);
2094     if (HAVE_ALTIVEC)
2095         ff_sws_init_swScale_altivec(c);
2096
2097     return swScale;
2098 }
2099
/**
 * Copy srcSliceH lines of a plane into dst, starting at destination line
 * srcSliceY.
 *
 * When both strides are equal and positive the whole slice is copied with a
 * single memcpy of srcSliceH * dstStride bytes (width is not consulted on
 * that path); otherwise width bytes are copied per line.
 */
static void copyPlane(const uint8_t *src, int srcStride,
                      int srcSliceY, int srcSliceH, int width,
                      uint8_t *dst, int dstStride)
{
    uint8_t *out = dst + dstStride * srcSliceY;
    const uint8_t *in = src;
    int row;

    if (srcStride > 0 && srcStride == dstStride) {
        memcpy(out, in, srcSliceH * dstStride);
        return;
    }

    for (row = 0; row < srcSliceH; row++, in += srcStride, out += dstStride)
        memcpy(out, in, width);
}
2116
2117 static int planarToNv12Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2118                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2119 {
2120     uint8_t *dst = dstParam[1] + dstStride[1]*srcSliceY/2;
2121
2122     copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
2123               dstParam[0], dstStride[0]);
2124
2125     if (c->dstFormat == PIX_FMT_NV12)
2126         interleaveBytes(src[1], src[2], dst, c->srcW/2, srcSliceH/2, srcStride[1], srcStride[2], dstStride[0]);
2127     else
2128         interleaveBytes(src[2], src[1], dst, c->srcW/2, srcSliceH/2, srcStride[2], srcStride[1], dstStride[0]);
2129
2130     return srcSliceH;
2131 }
2132
2133 static int planarToYuy2Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2134                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2135 {
2136     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
2137
2138     yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
2139
2140     return srcSliceH;
2141 }
2142
2143 static int planarToUyvyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2144                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2145 {
2146     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
2147
2148     yv12touyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0], srcStride[1], dstStride[0]);
2149
2150     return srcSliceH;
2151 }
2152
2153 static int yuv422pToYuy2Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2154                                 int srcSliceH, uint8_t* dstParam[], int dstStride[])
2155 {
2156     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
2157
2158     yuv422ptoyuy2(src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0]);
2159
2160     return srcSliceH;
2161 }
2162
2163 static int yuv422pToUyvyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2164                                 int srcSliceH, uint8_t* dstParam[], int dstStride[])
2165 {
2166     uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
2167
2168     yuv422ptouyvy(src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0]);
2169
2170     return srcSliceH;
2171 }
2172
2173 static int yuyvToYuv420Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2174                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2175 {
2176     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
2177     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY/2;
2178     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY/2;
2179
2180     yuyvtoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
2181
2182     if (dstParam[3])
2183         fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
2184
2185     return srcSliceH;
2186 }
2187
2188 static int yuyvToYuv422Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2189                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2190 {
2191     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
2192     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY;
2193     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY;
2194
2195     yuyvtoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
2196
2197     return srcSliceH;
2198 }
2199
2200 static int uyvyToYuv420Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2201                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2202 {
2203     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
2204     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY/2;
2205     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY/2;
2206
2207     uyvytoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
2208
2209     if (dstParam[3])
2210         fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
2211
2212     return srcSliceH;
2213 }
2214
2215 static int uyvyToYuv422Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2216                                int srcSliceH, uint8_t* dstParam[], int dstStride[])
2217 {
2218     uint8_t *ydst=dstParam[0] + dstStride[0]*srcSliceY;
2219     uint8_t *udst=dstParam[1] + dstStride[1]*srcSliceY;
2220     uint8_t *vdst=dstParam[2] + dstStride[2]*srcSliceY;
2221
2222     uyvytoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0], dstStride[1], srcStride[0]);
2223
2224     return srcSliceH;
2225 }
2226
/**
 * Expand two-byte (gray, alpha) pixels to 32-bit pixels.
 *
 * For each pixel the gray byte indexes the palette, read as an array of
 * 32-bit entries, and the alpha byte is OR-ed into bits 24..31 of the result.
 *
 * @param src        num_pixels pairs of (gray, alpha) bytes
 * @param dst        output, written as num_pixels uint32_t values
 * @param num_pixels number of pixels to convert
 * @param palette    table read as uint32_t entries indexed by the gray value
 */
static void gray8aToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette)
{
    int i;
    for (i=0; i<num_pixels; i++) {
        /* widen before shifting: an int shifted into its sign bit is undefined */
        const uint32_t alpha = (uint32_t)src[(i<<1)+1] << 24;
        ((uint32_t *) dst)[i] = ((const uint32_t *)palette)[src[i<<1]] | alpha;
    }
}
2233
/**
 * Expand one line of 2-byte (gray, alpha) pixels to packed 32-bit pixels,
 * with the alpha byte OR-ed into the low 8 bits of the looked-up value.
 *
 * @param src        num_pixels pairs of bytes: gray, then alpha
 * @param dst        output, written as num_pixels uint32_t values
 * @param num_pixels number of pixels to convert
 * @param palette    256-entry table, read as uint32_t
 */
static void gray8aToPacked32_1(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette)
{
    const uint32_t *lut = (const uint32_t *)palette;
    uint32_t *out = (uint32_t *)dst;
    int n;

    for (n = 0; n < num_pixels; n++, src += 2)
        out[n] = lut[src[0]] | src[1];
}
2241
/**
 * Expand one line of 2-byte (gray, alpha) pixels to packed 24-bit pixels.
 *
 * The gray byte selects a 4-byte palette entry whose first three bytes are
 * copied to the output; the alpha byte is skipped.
 *
 * @param src        num_pixels pairs of bytes: gray, then alpha
 * @param dst        output, 3 bytes per pixel
 * @param num_pixels number of pixels to convert
 * @param palette    256 entries of 4 bytes each
 */
static void gray8aToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette)
{
    int n;

    for (n = 0; n < num_pixels; n++, src += 2, dst += 3) {
        //FIXME slow?
        const uint8_t *entry = palette + src[0]*4;

        dst[0] = entry[0];
        dst[1] = entry[1];
        dst[2] = entry[2];
    }
}
2254
2255 static int palToRgbWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2256                            int srcSliceH, uint8_t* dst[], int dstStride[])
2257 {
2258     const enum PixelFormat srcFormat= c->srcFormat;
2259     const enum PixelFormat dstFormat= c->dstFormat;
2260     void (*conv)(const uint8_t *src, uint8_t *dst, int num_pixels,
2261                  const uint8_t *palette)=NULL;
2262     int i;
2263     uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
2264     const uint8_t *srcPtr= src[0];
2265
2266     if (srcFormat == PIX_FMT_Y400A) {
2267         switch (dstFormat) {
2268         case PIX_FMT_RGB32  : conv = gray8aToPacked32; break;
2269         case PIX_FMT_BGR32  : conv = gray8aToPacked32; break;
2270         case PIX_FMT_BGR32_1: conv = gray8aToPacked32_1; break;
2271         case PIX_FMT_RGB32_1: conv = gray8aToPacked32_1; break;
2272         case PIX_FMT_RGB24  : conv = gray8aToPacked24; break;
2273         case PIX_FMT_BGR24  : conv = gray8aToPacked24; break;
2274         }
2275     } else if (usePal(srcFormat)) {
2276         switch (dstFormat) {
2277         case PIX_FMT_RGB32  : conv = sws_convertPalette8ToPacked32; break;
2278         case PIX_FMT_BGR32  : conv = sws_convertPalette8ToPacked32; break;
2279         case PIX_FMT_BGR32_1: conv = sws_convertPalette8ToPacked32; break;
2280         case PIX_FMT_RGB32_1: conv = sws_convertPalette8ToPacked32; break;
2281         case PIX_FMT_RGB24  : conv = sws_convertPalette8ToPacked24; break;
2282         case PIX_FMT_BGR24  : conv = sws_convertPalette8ToPacked24; break;
2283         }
2284     }
2285
2286     if (!conv)
2287         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
2288                sws_format_name(srcFormat), sws_format_name(dstFormat));
2289     else {
2290         for (i=0; i<srcSliceH; i++) {
2291             conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
2292             srcPtr+= srcStride[0];
2293             dstPtr+= dstStride[0];
2294         }
2295     }
2296
2297     return srcSliceH;
2298 }
2299
2300 #define isRGBA32(x) (            \
2301            (x) == PIX_FMT_ARGB   \
2302         || (x) == PIX_FMT_RGBA   \
2303         || (x) == PIX_FMT_BGRA   \
2304         || (x) == PIX_FMT_ABGR   \
2305         )
2306
2307 /* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
2308 static int rgbToRgbWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2309                            int srcSliceH, uint8_t* dst[], int dstStride[])
2310 {
2311     const enum PixelFormat srcFormat= c->srcFormat;
2312     const enum PixelFormat dstFormat= c->dstFormat;
2313     const int srcBpp= (c->srcFormatBpp + 7) >> 3;
2314     const int dstBpp= (c->dstFormatBpp + 7) >> 3;
2315     const int srcId= c->srcFormatBpp >> 2; /* 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8 */
2316     const int dstId= c->dstFormatBpp >> 2;
2317     void (*conv)(const uint8_t *src, uint8_t *dst, int src_size)=NULL;
2318
2319 #define CONV_IS(src, dst) (srcFormat == PIX_FMT_##src && dstFormat == PIX_FMT_##dst)
2320
2321     if (isRGBA32(srcFormat) && isRGBA32(dstFormat)) {
2322         if (     CONV_IS(ABGR, RGBA)
2323               || CONV_IS(ARGB, BGRA)
2324               || CONV_IS(BGRA, ARGB)
2325               || CONV_IS(RGBA, ABGR)) conv = shuffle_bytes_3210;
2326         else if (CONV_IS(ABGR, ARGB)
2327               || CONV_IS(ARGB, ABGR)) conv = shuffle_bytes_0321;
2328         else if (CONV_IS(ABGR, BGRA)
2329               || CONV_IS(ARGB, RGBA)) conv = shuffle_bytes_1230;
2330         else if (CONV_IS(BGRA, RGBA)
2331               || CONV_IS(RGBA, BGRA)) conv = shuffle_bytes_2103;
2332         else if (CONV_IS(BGRA, ABGR)
2333               || CONV_IS(RGBA, ARGB)) conv = shuffle_bytes_3012;
2334     } else
2335     /* BGR -> BGR */
2336     if (  (isBGRinInt(srcFormat) && isBGRinInt(dstFormat))
2337        || (isRGBinInt(srcFormat) && isRGBinInt(dstFormat))) {
2338         switch(srcId | (dstId<<4)) {
2339         case 0x34: conv= rgb16to15; break;
2340         case 0x36: conv= rgb24to15; break;
2341         case 0x38: conv= rgb32to15; break;
2342         case 0x43: conv= rgb15to16; break;
2343         case 0x46: conv= rgb24to16; break;
2344         case 0x48: conv= rgb32to16; break;
2345         case 0x63: conv= rgb15to24; break;
2346         case 0x64: conv= rgb16to24; break;
2347         case 0x68: conv= rgb32to24; break;
2348         case 0x83: conv= rgb15to32; break;
2349         case 0x84: conv= rgb16to32; break;
2350         case 0x86: conv= rgb24to32; break;
2351         }
2352     } else if (  (isBGRinInt(srcFormat) && isRGBinInt(dstFormat))
2353              || (isRGBinInt(srcFormat) && isBGRinInt(dstFormat))) {
2354         switch(srcId | (dstId<<4)) {
2355         case 0x33: conv= rgb15tobgr15; break;
2356         case 0x34: conv= rgb16tobgr15; break;
2357         case 0x36: conv= rgb24tobgr15; break;
2358         case 0x38: conv= rgb32tobgr15; break;
2359         case 0x43: conv= rgb15tobgr16; break;
2360         case 0x44: conv= rgb16tobgr16; break;
2361         case 0x46: conv= rgb24tobgr16; break;
2362         case 0x48: conv= rgb32tobgr16; break;
2363         case 0x63: conv= rgb15tobgr24; break;
2364         case 0x64: conv= rgb16tobgr24; break;
2365         case 0x66: conv= rgb24tobgr24; break;
2366         case 0x68: conv= rgb32tobgr24; break;
2367         case 0x83: conv= rgb15tobgr32; break;
2368         case 0x84: conv= rgb16tobgr32; break;
2369         case 0x86: conv= rgb24tobgr32; break;
2370         }
2371     }
2372
2373     if (!conv) {
2374         av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
2375                sws_format_name(srcFormat), sws_format_name(dstFormat));
2376     } else {
2377         const uint8_t *srcPtr= src[0];
2378               uint8_t *dstPtr= dst[0];
2379         if ((srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1) && !isRGBA32(dstFormat))
2380             srcPtr += ALT32_CORR;
2381
2382         if ((dstFormat == PIX_FMT_RGB32_1 || dstFormat == PIX_FMT_BGR32_1) && !isRGBA32(srcFormat))
2383             dstPtr += ALT32_CORR;
2384
2385         if (dstStride[0]*srcBpp == srcStride[0]*dstBpp && srcStride[0] > 0)
2386             conv(srcPtr, dstPtr + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
2387         else {
2388             int i;
2389             dstPtr += dstStride[0]*srcSliceY;
2390
2391             for (i=0; i<srcSliceH; i++) {
2392                 conv(srcPtr, dstPtr, c->srcW*srcBpp);
2393                 srcPtr+= srcStride[0];
2394                 dstPtr+= dstStride[0];
2395             }
2396         }
2397     }
2398     return srcSliceH;
2399 }
2400
2401 static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2402                               int srcSliceH, uint8_t* dst[], int dstStride[])
2403 {
2404     rgb24toyv12(
2405         src[0],
2406         dst[0]+ srcSliceY    *dstStride[0],
2407         dst[1]+(srcSliceY>>1)*dstStride[1],
2408         dst[2]+(srcSliceY>>1)*dstStride[2],
2409         c->srcW, srcSliceH,
2410         dstStride[0], dstStride[1], srcStride[0]);
2411     if (dst[3])
2412         fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
2413     return srcSliceH;
2414 }
2415
2416 static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2417                              int srcSliceH, uint8_t* dst[], int dstStride[])
2418 {
2419     copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
2420               dst[0], dstStride[0]);
2421
2422     planar2x(src[1], dst[1] + dstStride[1]*(srcSliceY >> 1), c->chrSrcW,
2423              srcSliceH >> 2, srcStride[1], dstStride[1]);
2424     planar2x(src[2], dst[2] + dstStride[2]*(srcSliceY >> 1), c->chrSrcW,
2425              srcSliceH >> 2, srcStride[2], dstStride[2]);
2426     if (dst[3])
2427         fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
2428     return srcSliceH;
2429 }
2430
2431 /* unscaled copy like stuff (assumes nearly identical formats) */
2432 static int packedCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2433                              int srcSliceH, uint8_t* dst[], int dstStride[])
2434 {
2435     if (dstStride[0]==srcStride[0] && srcStride[0] > 0)
2436         memcpy(dst[0] + dstStride[0]*srcSliceY, src[0], srcSliceH*dstStride[0]);
2437     else {
2438         int i;
2439         const uint8_t *srcPtr= src[0];
2440         uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
2441         int length=0;
2442
2443         /* universal length finder */
2444         while(length+c->srcW <= FFABS(dstStride[0])
2445            && length+c->srcW <= FFABS(srcStride[0])) length+= c->srcW;
2446         assert(length!=0);
2447
2448         for (i=0; i<srcSliceH; i++) {
2449             memcpy(dstPtr, srcPtr, length);
2450             srcPtr+= srcStride[0];
2451             dstPtr+= dstStride[0];
2452         }
2453     }
2454     return srcSliceH;
2455 }
2456
2457 static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2458                              int srcSliceH, uint8_t* dst[], int dstStride[])
2459 {
2460     int plane, i, j;
2461     for (plane=0; plane<4; plane++) {
2462         int length= (plane==0 || plane==3) ? c->srcW  : -((-c->srcW  )>>c->chrDstHSubSample);
2463         int y=      (plane==0 || plane==3) ? srcSliceY: -((-srcSliceY)>>c->chrDstVSubSample);
2464         int height= (plane==0 || plane==3) ? srcSliceH: -((-srcSliceH)>>c->chrDstVSubSample);
2465         const uint8_t *srcPtr= src[plane];
2466         uint8_t *dstPtr= dst[plane] + dstStride[plane]*y;
2467
2468         if (!dst[plane]) continue;
2469         // ignore palette for GRAY8
2470         if (plane == 1 && !dst[2]) continue;
2471         if (!src[plane] || (plane == 1 && !src[2])) {
2472             if(is16BPS(c->dstFormat))
2473                 length*=2;
2474             fillPlane(dst[plane], dstStride[plane], length, height, y, (plane==3) ? 255 : 128);
2475         } else {
2476             if(is9_OR_10BPS(c->srcFormat)) {
2477                 const int src_depth = av_pix_fmt_descriptors[c->srcFormat].comp[plane].depth_minus1+1;
2478                 const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
2479                 const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
2480
2481                 if (is16BPS(c->dstFormat)) {
2482                     uint16_t *dstPtr2 = (uint16_t*)dstPtr;
2483 #define COPY9_OR_10TO16(rfunc, wfunc) \
2484                     for (i = 0; i < height; i++) { \
2485                         for (j = 0; j < length; j++) { \
2486                             int srcpx = rfunc(&srcPtr2[j]); \
2487                             wfunc(&dstPtr2[j], (srcpx<<(16-src_depth)) | (srcpx>>(2*src_depth-16))); \
2488                         } \
2489                         dstPtr2 += dstStride[plane]/2; \
2490                         srcPtr2 += srcStride[plane]/2; \
2491                     }
2492                     if (isBE(c->dstFormat)) {
2493                         if (isBE(c->srcFormat)) {
2494                             COPY9_OR_10TO16(AV_RB16, AV_WB16);
2495                         } else {
2496                             COPY9_OR_10TO16(AV_RL16, AV_WB16);
2497                         }
2498                     } else {
2499                         if (isBE(c->srcFormat)) {
2500                             COPY9_OR_10TO16(AV_RB16, AV_WL16);
2501                         } else {
2502                             COPY9_OR_10TO16(AV_RL16, AV_WL16);
2503                         }
2504                     }
2505                 } else if (is9_OR_10BPS(c->dstFormat)) {
2506                     uint16_t *dstPtr2 = (uint16_t*)dstPtr;
2507 #define COPY9_OR_10TO9_OR_10(loop) \
2508                     for (i = 0; i < height; i++) { \
2509                         for (j = 0; j < length; j++) { \
2510                             loop; \
2511                         } \
2512                         dstPtr2 += dstStride[plane]/2; \
2513                         srcPtr2 += srcStride[plane]/2; \
2514                     }
2515 #define COPY9_OR_10TO9_OR_10_2(rfunc, wfunc) \
2516                     if (dst_depth > src_depth) { \
2517                         COPY9_OR_10TO9_OR_10(int srcpx = rfunc(&srcPtr2[j]); \
2518                             wfunc(&dstPtr2[j], (srcpx << 1) | (srcpx >> 9))); \
2519                     } else if (dst_depth < src_depth) { \
2520                         COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]) >> 1)); \
2521                     } else { \
2522                         COPY9_OR_10TO9_OR_10(wfunc(&dstPtr2[j], rfunc(&srcPtr2[j]))); \
2523                     }
2524                     if (isBE(c->dstFormat)) {
2525                         if (isBE(c->srcFormat)) {
2526                             COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WB16);
2527                         } else {
2528                             COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WB16);
2529                         }
2530                     } else {
2531                         if (isBE(c->srcFormat)) {
2532                             COPY9_OR_10TO9_OR_10_2(AV_RB16, AV_WL16);
2533                         } else {
2534                             COPY9_OR_10TO9_OR_10_2(AV_RL16, AV_WL16);
2535                         }
2536                     }
2537                 } else {
2538                     // FIXME Maybe dither instead.
2539 #define COPY9_OR_10TO8(rfunc) \
2540                     for (i = 0; i < height; i++) { \
2541                         for (j = 0; j < length; j++) { \
2542                             dstPtr[j] = rfunc(&srcPtr2[j])>>(src_depth-8); \
2543                         } \
2544                         dstPtr  += dstStride[plane]; \
2545                         srcPtr2 += srcStride[plane]/2; \
2546                     }
2547                     if (isBE(c->srcFormat)) {
2548                         COPY9_OR_10TO8(AV_RB16);
2549                     } else {
2550                         COPY9_OR_10TO8(AV_RL16);
2551                     }
2552                 }
2553             } else if(is9_OR_10BPS(c->dstFormat)) {
2554                 const int dst_depth = av_pix_fmt_descriptors[c->dstFormat].comp[plane].depth_minus1+1;
2555                 uint16_t *dstPtr2 = (uint16_t*)dstPtr;
2556
2557                 if (is16BPS(c->srcFormat)) {
2558                     const uint16_t *srcPtr2 = (const uint16_t*)srcPtr;
2559 #define COPY16TO9_OR_10(rfunc, wfunc) \
2560                     for (i = 0; i < height; i++) { \
2561                         for (j = 0; j < length; j++) { \
2562                             wfunc(&dstPtr2[j], rfunc(&srcPtr2[j])>>(16-dst_depth)); \
2563                         } \
2564                         dstPtr2 += dstStride[plane]/2; \
2565                         srcPtr2 += srcStride[plane]/2; \
2566                     }
2567                     if (isBE(c->dstFormat)) {
2568                         if (isBE(c->srcFormat)) {
2569                             COPY16TO9_OR_10(AV_RB16, AV_WB16);
2570                         } else {
2571                             COPY16TO9_OR_10(AV_RL16, AV_WB16);
2572                         }
2573                     } else {
2574                         if (isBE(c->srcFormat)) {
2575                             COPY16TO9_OR_10(AV_RB16, AV_WL16);
2576                         } else {
2577                             COPY16TO9_OR_10(AV_RL16, AV_WL16);
2578                         }
2579                     }
2580                 } else /* 8bit */ {
2581 #define COPY8TO9_OR_10(wfunc) \
2582                     for (i = 0; i < height; i++) { \
2583                         for (j = 0; j < length; j++) { \
2584                             const int srcpx = srcPtr[j]; \
2585                             wfunc(&dstPtr2[j], (srcpx<<(dst_depth-8)) | (srcpx >> (16-dst_depth))); \
2586                         } \
2587                         dstPtr2 += dstStride[plane]/2; \
2588                         srcPtr  += srcStride[plane]; \
2589                     }
2590                     if (isBE(c->dstFormat)) {
2591                         COPY8TO9_OR_10(AV_WB16);
2592                     } else {
2593                         COPY8TO9_OR_10(AV_WL16);
2594                     }
2595                 }
2596             } else if(is16BPS(c->srcFormat) && !is16BPS(c->dstFormat)) {
2597                 if (!isBE(c->srcFormat)) srcPtr++;
2598                 for (i=0; i<height; i++) {
2599                     for (j=0; j<length; j++) dstPtr[j] = srcPtr[j<<1];
2600                     srcPtr+= srcStride[plane];
2601                     dstPtr+= dstStride[plane];
2602                 }
2603             } else if(!is16BPS(c->srcFormat) && is16BPS(c->dstFormat)) {
2604                 for (i=0; i<height; i++) {
2605                     for (j=0; j<length; j++) {
2606                         dstPtr[ j<<1   ] = srcPtr[j];
2607                         dstPtr[(j<<1)+1] = srcPtr[j];
2608                     }
2609                     srcPtr+= srcStride[plane];
2610                     dstPtr+= dstStride[plane];
2611                 }
2612             } else if(is16BPS(c->srcFormat) && is16BPS(c->dstFormat)
2613                   && isBE(c->srcFormat) != isBE(c->dstFormat)) {
2614
2615                 for (i=0; i<height; i++) {
2616                     for (j=0; j<length; j++)
2617                         ((uint16_t*)dstPtr)[j] = av_bswap16(((const uint16_t*)srcPtr)[j]);
2618                     srcPtr+= srcStride[plane];
2619                     dstPtr+= dstStride[plane];
2620                 }
2621             } else if (dstStride[plane] == srcStride[plane] &&
2622                        srcStride[plane] > 0 && srcStride[plane] == length) {
2623                 memcpy(dst[plane] + dstStride[plane]*y, src[plane],
2624                        height*dstStride[plane]);
2625             } else {
2626                 if(is16BPS(c->srcFormat) && is16BPS(c->dstFormat))
2627                     length*=2;
2628                 for (i=0; i<height; i++) {
2629                     memcpy(dstPtr, srcPtr, length);
2630                     srcPtr+= srcStride[plane];
2631                     dstPtr+= dstStride[plane];
2632                 }
2633             }
2634         }
2635     }
2636     return srcSliceH;
2637 }
2638
2639 void ff_get_unscaled_swscale(SwsContext *c)
2640 {
2641     const enum PixelFormat srcFormat = c->srcFormat;
2642     const enum PixelFormat dstFormat = c->dstFormat;
2643     const int flags = c->flags;
2644     const int dstH = c->dstH;
2645     int needsDither;
2646
2647     needsDither= isAnyRGB(dstFormat)
2648         &&  c->dstFormatBpp < 24
2649         && (c->dstFormatBpp < c->srcFormatBpp || (!isAnyRGB(srcFormat)));
2650
2651     /* yv12_to_nv12 */
2652     if ((srcFormat == PIX_FMT_YUV420P || srcFormat == PIX_FMT_YUVA420P) && (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)) {
2653         c->swScale= planarToNv12Wrapper;
2654     }
2655     /* yuv2bgr */
2656     if ((srcFormat==PIX_FMT_YUV420P || srcFormat==PIX_FMT_YUV422P || srcFormat==PIX_FMT_YUVA420P) && isAnyRGB(dstFormat)
2657         && !(flags & SWS_ACCURATE_RND) && !(dstH&1)) {
2658         c->swScale= ff_yuv2rgb_get_func_ptr(c);
2659     }
2660
2661     if (srcFormat==PIX_FMT_YUV410P && (dstFormat==PIX_FMT_YUV420P || dstFormat==PIX_FMT_YUVA420P) && !(flags & SWS_BITEXACT)) {
2662         c->swScale= yvu9ToYv12Wrapper;
2663     }
2664
2665     /* bgr24toYV12 */
2666     if (srcFormat==PIX_FMT_BGR24 && (dstFormat==PIX_FMT_YUV420P || dstFormat==PIX_FMT_YUVA420P) && !(flags & SWS_ACCURATE_RND))
2667         c->swScale= bgr24ToYv12Wrapper;
2668
2669     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
2670     if (   isAnyRGB(srcFormat)
2671         && isAnyRGB(dstFormat)
2672         && srcFormat != PIX_FMT_BGR8      && dstFormat != PIX_FMT_BGR8
2673         && srcFormat != PIX_FMT_RGB8      && dstFormat != PIX_FMT_RGB8
2674         && srcFormat != PIX_FMT_BGR4      && dstFormat != PIX_FMT_BGR4
2675         && srcFormat != PIX_FMT_RGB4      && dstFormat != PIX_FMT_RGB4
2676         && srcFormat != PIX_FMT_BGR4_BYTE && dstFormat != PIX_FMT_BGR4_BYTE
2677         && srcFormat != PIX_FMT_RGB4_BYTE && dstFormat != PIX_FMT_RGB4_BYTE
2678         && srcFormat != PIX_FMT_MONOBLACK && dstFormat != PIX_FMT_MONOBLACK
2679         && srcFormat != PIX_FMT_MONOWHITE && dstFormat != PIX_FMT_MONOWHITE
2680         && srcFormat != PIX_FMT_RGB48LE   && dstFormat != PIX_FMT_RGB48LE
2681         && srcFormat != PIX_FMT_RGB48BE   && dstFormat != PIX_FMT_RGB48BE
2682         && srcFormat != PIX_FMT_BGR48LE   && dstFormat != PIX_FMT_BGR48LE
2683         && srcFormat != PIX_FMT_BGR48BE   && dstFormat != PIX_FMT_BGR48BE
2684         && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
2685         c->swScale= rgbToRgbWrapper;
2686
2687     if ((usePal(srcFormat) && (
2688         dstFormat == PIX_FMT_RGB32   ||
2689         dstFormat == PIX_FMT_RGB32_1 ||
2690         dstFormat == PIX_FMT_RGB24   ||
2691         dstFormat == PIX_FMT_BGR32   ||
2692         dstFormat == PIX_FMT_BGR32_1 ||
2693         dstFormat == PIX_FMT_BGR24)))
2694         c->swScale= palToRgbWrapper;
2695
2696     if (srcFormat == PIX_FMT_YUV422P) {
2697         if (dstFormat == PIX_FMT_YUYV422)
2698             c->swScale= yuv422pToYuy2Wrapper;
2699         else if (dstFormat == PIX_FMT_UYVY422)
2700             c->swScale= yuv422pToUyvyWrapper;
2701     }
2702
2703     /* LQ converters if -sws 0 or -sws 4*/
2704     if (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)) {
2705         /* yv12_to_yuy2 */
2706         if (srcFormat == PIX_FMT_YUV420P || srcFormat == PIX_FMT_YUVA420P) {
2707             if (dstFormat == PIX_FMT_YUYV422)
2708                 c->swScale= planarToYuy2Wrapper;
2709             else if (dstFormat == PIX_FMT_UYVY422)
2710                 c->swScale= planarToUyvyWrapper;
2711         }
2712     }
2713     if(srcFormat == PIX_FMT_YUYV422 && (dstFormat == PIX_FMT_YUV420P || dstFormat == PIX_FMT_YUVA420P))
2714         c->swScale= yuyvToYuv420Wrapper;
2715     if(srcFormat == PIX_FMT_UYVY422 && (dstFormat == PIX_FMT_YUV420P || dstFormat == PIX_FMT_YUVA420P))
2716         c->swScale= uyvyToYuv420Wrapper;
2717     if(srcFormat == PIX_FMT_YUYV422 && dstFormat == PIX_FMT_YUV422P)
2718         c->swScale= yuyvToYuv422Wrapper;
2719     if(srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
2720         c->swScale= uyvyToYuv422Wrapper;
2721
2722     /* simple copy */
2723     if (  srcFormat == dstFormat
2724         || (srcFormat == PIX_FMT_YUVA420P && dstFormat == PIX_FMT_YUV420P)
2725         || (srcFormat == PIX_FMT_YUV420P && dstFormat == PIX_FMT_YUVA420P)
2726         || (isPlanarYUV(srcFormat) && isGray(dstFormat))
2727         || (isPlanarYUV(dstFormat) && isGray(srcFormat))
2728         || (isGray(dstFormat) && isGray(srcFormat))
2729         || (isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat)
2730             && c->chrDstHSubSample == c->chrSrcHSubSample
2731             && c->chrDstVSubSample == c->chrSrcVSubSample
2732             && dstFormat != PIX_FMT_NV12 && dstFormat != PIX_FMT_NV21
2733             && srcFormat != PIX_FMT_NV12 && srcFormat != PIX_FMT_NV21))
2734     {
2735         if (isPacked(c->srcFormat))
2736             c->swScale= packedCopyWrapper;
2737         else /* Planar YUV or gray */
2738             c->swScale= planarCopyWrapper;
2739     }
2740
2741     if (ARCH_BFIN)
2742         ff_bfin_get_unscaled_swscale(c);
2743     if (HAVE_ALTIVEC)
2744         ff_swscale_get_unscaled_altivec(c);
2745 }
2746
/**
 * NULL out the plane pointers that do not apply to the given format:
 * plane 3 unless the format is planar YUV with alpha, plane 2 unless it is
 * planar YUV, and plane 1 unless it is planar YUV or usePal() holds for it.
 */
static void reset_ptr(const uint8_t* src[], int format)
{
    const int planarYuv = isPlanarYUV(format);

    if (!planarYuv || !isALPHA(format))
        src[3] = NULL;

    if (planarYuv)
        return;

    src[2] = NULL;
    if (!usePal(format))
        src[1] = NULL;
}
2758
2759 static int check_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt,
2760                                 const int linesizes[4])
2761 {
2762     const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt];
2763     int i;
2764
2765     for (i = 0; i < 4; i++) {
2766         int plane = desc->comp[i].plane;
2767         if (!data[plane] || !linesizes[plane])
2768             return 0;
2769     }
2770
2771     return 1;
2772 }
2773
2774 /**
2775  * swscale wrapper, so we don't need to export the SwsContext.
2776  * Assumes planar YUV to be in YUV order instead of YVU.
2777  */
2778 int sws_scale(SwsContext *c, const uint8_t* const src[], const int srcStride[], int srcSliceY,
2779               int srcSliceH, uint8_t* const dst[], const int dstStride[])
2780 {
2781     int i;
2782     const uint8_t* src2[4]= {src[0], src[1], src[2], src[3]};
2783     uint8_t* dst2[4]= {dst[0], dst[1], dst[2], dst[3]};
2784
2785     // do not mess up sliceDir if we have a "trailing" 0-size slice
2786     if (srcSliceH == 0)
2787         return 0;
2788
2789     if (!check_image_pointers(src, c->srcFormat, srcStride)) {
2790         av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
2791         return 0;
2792     }
2793     if (!check_image_pointers(dst, c->dstFormat, dstStride)) {
2794         av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
2795         return 0;
2796     }
2797
2798     if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
2799         av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
2800         return 0;
2801     }
2802     if (c->sliceDir == 0) {
2803         if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
2804     }
2805
2806     if (usePal(c->srcFormat)) {
2807         for (i=0; i<256; i++) {
2808             int p, r, g, b,y,u,v;
2809             if(c->srcFormat == PIX_FMT_PAL8) {
2810                 p=((const uint32_t*)(src[1]))[i];
2811                 r= (p>>16)&0xFF;
2812                 g= (p>> 8)&0xFF;
2813                 b=  p     &0xFF;
2814             } else if(c->srcFormat == PIX_FMT_RGB8) {
2815                 r= (i>>5    )*36;
2816                 g= ((i>>2)&7)*36;
2817                 b= (i&3     )*85;
2818             } else if(c->srcFormat == PIX_FMT_BGR8) {
2819                 b= (i>>6    )*85;
2820                 g= ((i>>3)&7)*36;
2821                 r= (i&7     )*36;
2822             } else if(c->srcFormat == PIX_FMT_RGB4_BYTE) {
2823                 r= (i>>3    )*255;
2824                 g= ((i>>1)&3)*85;
2825                 b= (i&1     )*255;
2826             } else if(c->srcFormat == PIX_FMT_GRAY8 || c->srcFormat == PIX_FMT_Y400A) {
2827                 r = g = b = i;
2828             } else {
2829                 assert(c->srcFormat == PIX_FMT_BGR4_BYTE);
2830                 b= (i>>3    )*255;
2831                 g= ((i>>1)&3)*85;
2832                 r= (i&1     )*255;
2833             }
2834             y= av_clip_uint8((RY*r + GY*g + BY*b + ( 33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2835             u= av_clip_uint8((RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2836             v= av_clip_uint8((RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2837             c->pal_yuv[i]= y + (u<<8) + (v<<16);
2838
2839             switch(c->dstFormat) {
2840             case PIX_FMT_BGR32:
2841 #if !HAVE_BIGENDIAN
2842             case PIX_FMT_RGB24:
2843 #endif
2844                 c->pal_rgb[i]=  r + (g<<8) + (b<<16);
2845                 break;
2846             case PIX_FMT_BGR32_1:
2847 #if HAVE_BIGENDIAN
2848             case PIX_FMT_BGR24:
2849 #endif
2850                 c->pal_rgb[i]= (r + (g<<8) + (b<<16)) << 8;
2851                 break;
2852             case PIX_FMT_RGB32_1:
2853 #if HAVE_BIGENDIAN
2854             case PIX_FMT_RGB24:
2855 #endif
2856                 c->pal_rgb[i]= (b + (g<<8) + (r<<16)) << 8;
2857                 break;
2858             case PIX_FMT_RGB32:
2859 #if !HAVE_BIGENDIAN
2860             case PIX_FMT_BGR24:
2861 #endif
2862             default:
2863                 c->pal_rgb[i]=  b + (g<<8) + (r<<16);
2864             }
2865         }
2866     }
2867
2868     // copy strides, so they can safely be modified
2869     if (c->sliceDir == 1) {
2870         // slices go from top to bottom
2871         int srcStride2[4]= {srcStride[0], srcStride[1], srcStride[2], srcStride[3]};
2872         int dstStride2[4]= {dstStride[0], dstStride[1], dstStride[2], dstStride[3]};
2873
2874         reset_ptr(src2, c->srcFormat);
2875         reset_ptr((const uint8_t**)dst2, c->dstFormat);
2876
2877         /* reset slice direction at end of frame */
2878         if (srcSliceY + srcSliceH == c->srcH)
2879             c->sliceDir = 0;
2880
2881         return c->swScale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2, dstStride2);
2882     } else {
2883         // slices go from bottom to top => we flip the image internally
2884         int srcStride2[4]= {-srcStride[0], -srcStride[1], -srcStride[2], -srcStride[3]};
2885         int dstStride2[4]= {-dstStride[0], -dstStride[1], -dstStride[2], -dstStride[3]};
2886
2887         src2[0] += (srcSliceH-1)*srcStride[0];
2888         if (!usePal(c->srcFormat))
2889             src2[1] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[1];
2890         src2[2] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[2];
2891         src2[3] += (srcSliceH-1)*srcStride[3];
2892         dst2[0] += ( c->dstH                      -1)*dstStride[0];
2893         dst2[1] += ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[1];
2894         dst2[2] += ((c->dstH>>c->chrDstVSubSample)-1)*dstStride[2];
2895         dst2[3] += ( c->dstH                      -1)*dstStride[3];
2896
2897         reset_ptr(src2, c->srcFormat);
2898         reset_ptr((const uint8_t**)dst2, c->dstFormat);
2899
2900         /* reset slice direction at end of frame */
2901         if (!srcSliceY)
2902             c->sliceDir = 0;
2903
2904         return c->swScale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH, srcSliceH, dst2, dstStride2);
2905     }
2906 }
2907
/* Convert 8-bit palette indices to packed 32-bit pixels: each output pixel is
 * the 32-bit palette entry selected by the source byte, copied unchanged. */
void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette)
{
    const uint32_t *lut = (const uint32_t *) palette;
    uint32_t *out = (uint32_t *) dst;
    const uint8_t *const end = src + num_pixels;

    while (src < end)
        *out++ = lut[*src++];
}
2916
/* Palette format: ABCD -> dst format: ABC
 * Each source byte selects a 4-byte palette entry; its first three bytes are
 * written to the output and the fourth is dropped. */
void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette)
{
    int n;

    for (n = 0; n < num_pixels; n++, dst += 3) {
        //FIXME slow?
        const uint8_t *entry = palette + src[n]*4;

        dst[0] = entry[0];
        dst[1] = entry[1];
        dst[2] = entry[2];
    }
}