git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 static const double rgb2yuv_table[8][9]={
  84     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  85     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  86     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  87     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  88     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  89     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  90     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  91     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
  92 };
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
 109 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
 110 {  1,   3,   1,   3,   1,   3,   1,   3, },
 111 {  2,   0,   2,   0,   2,   0,   2,   0, },
 112 };
 113
 114 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 115 {  6,   2,   6,   2,   6,   2,   6,   2, },
 116 {  0,   4,   0,   4,   0,   4,   0,   4, },
 117 };
 118
 119 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 120 {  8,   4,  11,   7,   8,   4,  11,   7, },
 121 {  2,  14,   1,  13,   2,  14,   1,  13, },
 122 { 10,   6,   9,   5,  10,   6,   9,   5, },
 123 {  0,  12,   3,  15,   0,  12,   3,  15, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 127 { 17,   9,  23,  15,  16,   8,  22,  14, },
 128 {  5,  29,   3,  27,   4,  28,   2,  26, },
 129 { 21,  13,  19,  11,  20,  12,  18,  10, },
 130 {  0,  24,   6,  30,   1,  25,   7,  31, },
 131 { 16,   8,  22,  14,  17,   9,  23,  15, },
 132 {  4,  28,   2,  26,   5,  29,   3,  27, },
 133 { 20,  12,  18,  10,  21,  13,  19,  11, },
 134 {  1,  25,   7,  31,   0,  24,   6,  30, },
 135 };
 136
 137 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 138 {  0,  55,  14,  68,   3,  58,  17,  72, },
 139 { 37,  18,  50,  32,  40,  22,  54,  35, },
 140 {  9,  64,   5,  59,  13,  67,   8,  63, },
 141 { 46,  27,  41,  23,  49,  31,  44,  26, },
 142 {  2,  57,  16,  71,   1,  56,  15,  70, },
 143 { 39,  21,  52,  34,  38,  19,  51,  33, },
 144 { 11,  66,   7,  62,  10,  65,   6,  60, },
 145 { 48,  30,  43,  25,  47,  29,  42,  24, },
 146 };
 147
 148 #if 1
 149 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 150 {117,  62, 158, 103, 113,  58, 155, 100, },
 151 { 34, 199,  21, 186,  31, 196,  17, 182, },
 152 {144,  89, 131,  76, 141,  86, 127,  72, },
 153 {  0, 165,  41, 206,  10, 175,  52, 217, },
 154 {110,  55, 151,  96, 120,  65, 162, 107, },
 155 { 28, 193,  14, 179,  38, 203,  24, 189, },
 156 {138,  83, 124,  69, 148,  93, 134,  79, },
 157 {  7, 172,  48, 213,   3, 168,  45, 210, },
 158 };
 159 #elif 1
 160 // tries to correct a gamma of 1.5
 161 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 162 {  0, 143,  18, 200,   2, 156,  25, 215, },
 163 { 78,  28, 125,  64,  89,  36, 138,  74, },
 164 { 10, 180,   3, 161,  16, 195,   8, 175, },
 165 {109,  51,  93,  38, 121,  60, 105,  47, },
 166 {  1, 152,  23, 210,   0, 147,  20, 205, },
 167 { 85,  33, 134,  71,  81,  30, 130,  67, },
 168 { 14, 190,   6, 171,  12, 185,   5, 166, },
 169 {117,  57, 101,  44, 113,  54,  97,  41, },
 170 };
 171 #elif 1
 172 // tries to correct a gamma of 2.0
 173 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 174 {  0, 124,   8, 193,   0, 140,  12, 213, },
 175 { 55,  14, 104,  42,  66,  19, 119,  52, },
 176 {  3, 168,   1, 145,   6, 187,   3, 162, },
 177 { 86,  31,  70,  21,  99,  39,  82,  28, },
 178 {  0, 134,  11, 206,   0, 129,   9, 200, },
 179 { 62,  17, 114,  48,  58,  16, 109,  45, },
 180 {  5, 181,   2, 157,   4, 175,   1, 151, },
 181 { 95,  36,  78,  26,  90,  34,  74,  24, },
 182 };
 183 #else
 184 // tries to correct a gamma of 2.5
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 186 {  0, 107,   3, 187,   0, 125,   6, 212, },
 187 { 39,   7,  86,  28,  49,  11, 102,  36, },
 188 {  1, 158,   0, 131,   3, 180,   1, 151, },
 189 { 68,  19,  52,  12,  81,  25,  64,  17, },
 190 {  0, 119,   5, 203,   0, 113,   4, 195, },
 191 { 45,   9,  96,  33,  42,   8,  91,  30, },
 192 {  2, 172,   1, 144,   2, 165,   0, 137, },
 193 { 77,  23,  60,  15,  72,  21,  56,  14, },
 194 };
 195 #endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 203                       uint16_t *aDest, int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     int shift = 11 + 16 - output_bits;
 209
 210 #define output_pixel(pos, val) \
 211     if (big_endian) { \
 212         if (output_bits == 16) { \
 213             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 214         } else { \
 215             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 216         } \
 217     } else { \
 218         if (output_bits == 16) { \
 219             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 220         } else { \
 221             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 222         } \
 223     }
 224     for (i = 0; i < dstW; i++) {
 225         int val = 1 << (26-output_bits);
 226         int j;
 227
 228         for (j = 0; j < lumFilterSize; j++)
 229             val += lumSrc[j][i] * lumFilter[j];
 230
 231         output_pixel(&dest[i], val);
 232     }
 233
 234     if (uDest) {
 235         for (i = 0; i < chrDstW; i++) {
 236             int u = 1 << (26-output_bits);
 237             int v = 1 << (26-output_bits);
 238             int j;
 239
 240             for (j = 0; j < chrFilterSize; j++) {
 241                 u += chrUSrc[j][i] * chrFilter[j];
 242                 v += chrVSrc[j][i] * chrFilter[j];
 243             }
 244
 245             output_pixel(&uDest[i], u);
 246             output_pixel(&vDest[i], v);
 247         }
 248     }
 249
 250     if (CONFIG_SWSCALE_ALPHA && aDest) {
 251         for (i = 0; i < dstW; i++) {
 252             int val = 1 << (26-output_bits);
 253             int j;
 254
 255             for (j = 0; j < lumFilterSize; j++)
 256                 val += alpSrc[j][i] * lumFilter[j];
 257
 258             output_pixel(&aDest[i], val);
 259         }
 260     }
 261 #undef output_pixel
 262 }
 263
 264 #define yuv2NBPS(bits, BE_LE, is_be) \
 265 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 266                               const int16_t **lumSrc, int lumFilterSize, \
 267                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 268                               const int16_t **chrVSrc, \
 269                               int chrFilterSize, const int16_t **alpSrc, \
 270                               uint8_t *_dest, uint8_t *_uDest, uint8_t *_vDest, \
 271                               uint8_t *_aDest, int dstW, int chrDstW) \
 272 { \
 273     uint16_t *dest  = (uint16_t *) _dest,  *uDest = (uint16_t *) _uDest, \
 274              *vDest = (uint16_t *) _vDest, *aDest = (uint16_t *) _aDest; \
 275     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 276                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 277                           alpSrc, \
 278                           dest, uDest, vDest, aDest, \
 279                           dstW, chrDstW, is_be, bits); \
 280 }
 281 yuv2NBPS( 9, BE, 1);
 282 yuv2NBPS( 9, LE, 0);
 283 yuv2NBPS(10, BE, 1);
 284 yuv2NBPS(10, LE, 0);
 285 yuv2NBPS(16, BE, 1);
 286 yuv2NBPS(16, LE, 0);
 287
 288 static inline void yuv2yuvX16_c(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 289                                 const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize,
 290                                 const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, int dstW, int chrDstW,
 291                                 enum PixelFormat dstFormat)
 292 {
 293 #define conv16(bits) \
 294     if (isBE(dstFormat)) { \
 295         yuv2yuvX ## bits ## BE_c(c, lumFilter, lumSrc, lumFilterSize, \
 296                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 297                                  alpSrc, \
 298                                  dest, uDest, vDest, aDest, \
 299                                  dstW, chrDstW); \
 300     } else { \
 301         yuv2yuvX ## bits ## LE_c(c, lumFilter, lumSrc, lumFilterSize, \
 302                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 303                                  alpSrc, \
 304                                  dest, uDest, vDest, aDest, \
 305                                  dstW, chrDstW); \
 306     }
 307     if (is16BPS(dstFormat)) {
 308         conv16(16);
 309     } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
 310         conv16(9);
 311     } else {
 312         conv16(10);
 313     }
 314 #undef conv16
 315 }
 316
 317 static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 318                               const int16_t **lumSrc, int lumFilterSize,
 319                               const int16_t *chrFilter, const int16_t **chrUSrc,
 320                               const int16_t **chrVSrc,
 321                               int chrFilterSize, const int16_t **alpSrc,
 322                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 323                               uint8_t *aDest, int dstW, int chrDstW)
 324 {
 325     //FIXME Optimize (just quickly written not optimized..)
 326     int i;
 327     for (i=0; i<dstW; i++) {
 328         int val=1<<18;
 329         int j;
 330         for (j=0; j<lumFilterSize; j++)
 331             val += lumSrc[j][i] * lumFilter[j];
 332
 333         dest[i]= av_clip_uint8(val>>19);
 334     }
 335
 336     if (uDest)
 337         for (i=0; i<chrDstW; i++) {
 338             int u=1<<18;
 339             int v=1<<18;
 340             int j;
 341             for (j=0; j<chrFilterSize; j++) {
 342                 u += chrUSrc[j][i] * chrFilter[j];
 343                 v += chrVSrc[j][i] * chrFilter[j];
 344             }
 345
 346             uDest[i]= av_clip_uint8(u>>19);
 347             vDest[i]= av_clip_uint8(v>>19);
 348         }
 349
 350     if (CONFIG_SWSCALE_ALPHA && aDest)
 351         for (i=0; i<dstW; i++) {
 352             int val=1<<18;
 353             int j;
 354             for (j=0; j<lumFilterSize; j++)
 355                 val += alpSrc[j][i] * lumFilter[j];
 356
 357             aDest[i]= av_clip_uint8(val>>19);
 358         }
 359
 360 }
 361
 362 static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 363                                const int16_t **lumSrc, int lumFilterSize,
 364                                const int16_t *chrFilter, const int16_t **chrUSrc,
 365                                const int16_t **chrVSrc,
 366                                int chrFilterSize, uint8_t *dest, uint8_t *uDest,
 367                                int dstW, int chrDstW, enum PixelFormat dstFormat)
 368 {
 369     //FIXME Optimize (just quickly written not optimized..)
 370     int i;
 371     for (i=0; i<dstW; i++) {
 372         int val=1<<18;
 373         int j;
 374         for (j=0; j<lumFilterSize; j++)
 375             val += lumSrc[j][i] * lumFilter[j];
 376
 377         dest[i]= av_clip_uint8(val>>19);
 378     }
 379
 380     if (!uDest)
 381         return;
 382
 383     if (dstFormat == PIX_FMT_NV12)
 384         for (i=0; i<chrDstW; i++) {
 385             int u=1<<18;
 386             int v=1<<18;
 387             int j;
 388             for (j=0; j<chrFilterSize; j++) {
 389                 u += chrUSrc[j][i] * chrFilter[j];
 390                 v += chrVSrc[j][i] * chrFilter[j];
 391             }
 392
 393             uDest[2*i]= av_clip_uint8(u>>19);
 394             uDest[2*i+1]= av_clip_uint8(v>>19);
 395         }
 396     else
 397         for (i=0; i<chrDstW; i++) {
 398             int u=1<<18;
 399             int v=1<<18;
 400             int j;
 401             for (j=0; j<chrFilterSize; j++) {
 402                 u += chrUSrc[j][i] * chrFilter[j];
 403                 v += chrVSrc[j][i] * chrFilter[j];
 404             }
 405
 406             uDest[2*i]= av_clip_uint8(v>>19);
 407             uDest[2*i+1]= av_clip_uint8(u>>19);
 408         }
 409 }
 410
 411 #define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha) \
 412     for (i=0; i<(dstW>>1); i++) {\
 413         int j;\
 414         int Y1 = 1<<18;\
 415         int Y2 = 1<<18;\
 416         int U  = 1<<18;\
 417         int V  = 1<<18;\
 418         int av_unused A1, A2;\
 419         type av_unused *r, *b, *g;\
 420         const int i2= 2*i;\
 421         \
 422         for (j=0; j<lumFilterSize; j++) {\
 423             Y1 += lumSrc[j][i2] * lumFilter[j];\
 424             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 425         }\
 426         for (j=0; j<chrFilterSize; j++) {\
 427             U += chrUSrc[j][i] * chrFilter[j];\
 428             V += chrVSrc[j][i] * chrFilter[j];\
 429         }\
 430         Y1>>=19;\
 431         Y2>>=19;\
 432         U >>=19;\
 433         V >>=19;\
 434         if (alpha) {\
 435             A1 = 1<<18;\
 436             A2 = 1<<18;\
 437             for (j=0; j<lumFilterSize; j++) {\
 438                 A1 += alpSrc[j][i2  ] * lumFilter[j];\
 439                 A2 += alpSrc[j][i2+1] * lumFilter[j];\
 440             }\
 441             A1>>=19;\
 442             A2>>=19;\
 443         }
 444
 445 #define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
 446         YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha)\
 447         if ((Y1|Y2|U|V)&256) {\
 448             if (Y1>255)   Y1=255; \
 449             else if (Y1<0)Y1=0;   \
 450             if (Y2>255)   Y2=255; \
 451             else if (Y2<0)Y2=0;   \
 452             if (U>255)    U=255;  \
 453             else if (U<0) U=0;    \
 454             if (V>255)    V=255;  \
 455             else if (V<0) V=0;    \
 456         }\
 457         if (alpha && ((A1|A2)&256)) {\
 458             A1=av_clip_uint8(A1);\
 459             A2=av_clip_uint8(A2);\
 460         }
 461
 462 #define YSCALE_YUV_2_PACKEDX_FULL_C(rnd,alpha) \
 463     for (i=0; i<dstW; i++) {\
 464         int j;\
 465         int Y = 0;\
 466         int U = -128<<19;\
 467         int V = -128<<19;\
 468         int av_unused A;\
 469         int R,G,B;\
 470         \
 471         for (j=0; j<lumFilterSize; j++) {\
 472             Y += lumSrc[j][i     ] * lumFilter[j];\
 473         }\
 474         for (j=0; j<chrFilterSize; j++) {\
 475             U += chrUSrc[j][i] * chrFilter[j];\
 476             V += chrVSrc[j][i] * chrFilter[j];\
 477         }\
 478         Y >>=10;\
 479         U >>=10;\
 480         V >>=10;\
 481         if (alpha) {\
 482             A = rnd;\
 483             for (j=0; j<lumFilterSize; j++)\
 484                 A += alpSrc[j][i     ] * lumFilter[j];\
 485             A >>=19;\
 486             if (A&256)\
 487                 A = av_clip_uint8(A);\
 488         }
 489
 490 #define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
 491     YSCALE_YUV_2_PACKEDX_FULL_C(rnd>>3,alpha)\
 492         Y-= c->yuv2rgb_y_offset;\
 493         Y*= c->yuv2rgb_y_coeff;\
 494         Y+= rnd;\
 495         R= Y + V*c->yuv2rgb_v2r_coeff;\
 496         G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
 497         B= Y +                          U*c->yuv2rgb_u2b_coeff;\
 498         if ((R|G|B)&(0xC0000000)) {\
 499             if (R>=(256<<22))   R=(256<<22)-1; \
 500             else if (R<0)R=0;   \
 501             if (G>=(256<<22))   G=(256<<22)-1; \
 502             else if (G<0)G=0;   \
 503             if (B>=(256<<22))   B=(256<<22)-1; \
 504             else if (B<0)B=0;   \
 505         }
 506
 507 #define YSCALE_YUV_2_GRAY16_C \
 508     for (i=0; i<(dstW>>1); i++) {\
 509         int j;\
 510         int Y1 = 1<<18;\
 511         int Y2 = 1<<18;\
 512         int U  = 1<<18;\
 513         int V  = 1<<18;\
 514         \
 515         const int i2= 2*i;\
 516         \
 517         for (j=0; j<lumFilterSize; j++) {\
 518             Y1 += lumSrc[j][i2] * lumFilter[j];\
 519             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 520         }\
 521         Y1>>=11;\
 522         Y2>>=11;\
 523         if ((Y1|Y2|U|V)&65536) {\
 524             if (Y1>65535)   Y1=65535; \
 525             else if (Y1<0)Y1=0;   \
 526             if (Y2>65535)   Y2=65535; \
 527             else if (Y2<0)Y2=0;   \
 528         }
 529
 530 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
 531     YSCALE_YUV_2_PACKEDX_C(type,alpha)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
 532     r = (type *)c->table_rV[V];   \
 533     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
 534     b = (type *)c->table_bU[U];
 535
 536 #define YSCALE_YUV_2_PACKED2_C(type,alpha)   \
 537     for (i=0; i<(dstW>>1); i++) { \
 538         const int i2= 2*i;       \
 539         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
 540         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
 541         int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19;              \
 542         int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19;              \
 543         type av_unused *r, *b, *g;                                    \
 544         int av_unused A1, A2;                                         \
 545         if (alpha) {\
 546             A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
 547             A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
 548         }
 549
 550 #define YSCALE_YUV_2_GRAY16_2_C   \
 551     for (i=0; i<(dstW>>1); i++) { \
 552         const int i2= 2*i;       \
 553         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
 554         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;
 555
 556 #define YSCALE_YUV_2_RGB2_C(type,alpha) \
 557     YSCALE_YUV_2_PACKED2_C(type,alpha)\
 558     r = (type *)c->table_rV[V];\
 559     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 560     b = (type *)c->table_bU[U];
 561
 562 #define YSCALE_YUV_2_PACKED1_C(type,alpha) \
 563     for (i=0; i<(dstW>>1); i++) {\
 564         const int i2= 2*i;\
 565         int Y1= buf0[i2  ]>>7;\
 566         int Y2= buf0[i2+1]>>7;\
 567         int U= (ubuf1[i])>>7;\
 568         int V= (vbuf1[i])>>7;\
 569         type av_unused *r, *b, *g;\
 570         int av_unused A1, A2;\
 571         if (alpha) {\
 572             A1= abuf0[i2  ]>>7;\
 573             A2= abuf0[i2+1]>>7;\
 574         }
 575
 576 #define YSCALE_YUV_2_GRAY16_1_C \
 577     for (i=0; i<(dstW>>1); i++) {\
 578         const int i2= 2*i;\
 579         int Y1= buf0[i2  ]<<1;\
 580         int Y2= buf0[i2+1]<<1;
 581
 582 #define YSCALE_YUV_2_RGB1_C(type,alpha) \
 583     YSCALE_YUV_2_PACKED1_C(type,alpha)\
 584     r = (type *)c->table_rV[V];\
 585     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 586     b = (type *)c->table_bU[U];
 587
 588 #define YSCALE_YUV_2_PACKED1B_C(type,alpha) \
 589     for (i=0; i<(dstW>>1); i++) {\
 590         const int i2= 2*i;\
 591         int Y1= buf0[i2  ]>>7;\
 592         int Y2= buf0[i2+1]>>7;\
 593         int U= (ubuf0[i] + ubuf1[i])>>8;\
 594         int V= (vbuf0[i] + vbuf1[i])>>8;\
 595         type av_unused *r, *b, *g;\
 596         int av_unused A1, A2;\
 597         if (alpha) {\
 598             A1= abuf0[i2  ]>>7;\
 599             A2= abuf0[i2+1]>>7;\
 600         }
 601
 602 #define YSCALE_YUV_2_RGB1B_C(type,alpha) \
 603     YSCALE_YUV_2_PACKED1B_C(type,alpha)\
 604     r = (type *)c->table_rV[V];\
 605     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 606     b = (type *)c->table_bU[U];
 607
 608 #define YSCALE_YUV_2_MONO2_C \
 609     const uint8_t * const d128=dither_8x8_220[y&7];\
 610     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 611     for (i=0; i<dstW-7; i+=8) {\
 612         int acc;\
 613         acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
 614         acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
 615         acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
 616         acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
 617         acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
 618         acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
 619         acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
 620         acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
 621         ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 622         dest++;\
 623     }
 624
 625 #define YSCALE_YUV_2_MONOX_C \
 626     const uint8_t * const d128=dither_8x8_220[y&7];\
 627     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 628     int acc=0;\
 629     for (i=0; i<dstW-1; i+=2) {\
 630         int j;\
 631         int Y1=1<<18;\
 632         int Y2=1<<18;\
 633 \
 634         for (j=0; j<lumFilterSize; j++) {\
 635             Y1 += lumSrc[j][i] * lumFilter[j];\
 636             Y2 += lumSrc[j][i+1] * lumFilter[j];\
 637         }\
 638         Y1>>=19;\
 639         Y2>>=19;\
 640         if ((Y1|Y2)&256) {\
 641             if (Y1>255)   Y1=255;\
 642             else if (Y1<0)Y1=0;\
 643             if (Y2>255)   Y2=255;\
 644             else if (Y2<0)Y2=0;\
 645         }\
 646         acc+= acc + g[Y1+d128[(i+0)&7]];\
 647         acc+= acc + g[Y2+d128[(i+1)&7]];\
 648         if ((i&7)==6) {\
 649             ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 650             dest++;\
 651         }\
 652     }
 653
 654 #define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
 655     switch(c->dstFormat) {\
 656     case PIX_FMT_RGB48BE:\
 657     case PIX_FMT_RGB48LE:\
 658         func(uint8_t,0)\
 659             ((uint8_t*)dest)[ 0]= r[Y1];\
 660             ((uint8_t*)dest)[ 1]= r[Y1];\
 661             ((uint8_t*)dest)[ 2]= g[Y1];\
 662             ((uint8_t*)dest)[ 3]= g[Y1];\
 663             ((uint8_t*)dest)[ 4]= b[Y1];\
 664             ((uint8_t*)dest)[ 5]= b[Y1];\
 665             ((uint8_t*)dest)[ 6]= r[Y2];\
 666             ((uint8_t*)dest)[ 7]= r[Y2];\
 667             ((uint8_t*)dest)[ 8]= g[Y2];\
 668             ((uint8_t*)dest)[ 9]= g[Y2];\
 669             ((uint8_t*)dest)[10]= b[Y2];\
 670             ((uint8_t*)dest)[11]= b[Y2];\
 671             dest+=12;\
 672         }\
 673         break;\
 674     case PIX_FMT_BGR48BE:\
 675     case PIX_FMT_BGR48LE:\
 676         func(uint8_t,0)\
 677             ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
 678             ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
 679             ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
 680             ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
 681             ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
 682             ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
 683             dest+=12;\
 684         }\
 685         break;\
 686     case PIX_FMT_RGBA:\
 687     case PIX_FMT_BGRA:\
 688         if (CONFIG_SMALL) {\
 689             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 690             func(uint32_t,needAlpha)\
 691                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
 692                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
 693             }\
 694         } else {\
 695             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 696                 func(uint32_t,1)\
 697                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
 698                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
 699                 }\
 700             } else {\
 701                 func(uint32_t,0)\
 702                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 703                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 704                 }\
 705             }\
 706         }\
 707         break;\
 708     case PIX_FMT_ARGB:\
 709     case PIX_FMT_ABGR:\
 710         if (CONFIG_SMALL) {\
 711             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 712             func(uint32_t,needAlpha)\
 713                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
 714                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
 715             }\
 716         } else {\
 717             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 718                 func(uint32_t,1)\
 719                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
 720                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
 721                 }\
 722             } else {\
 723                 func(uint32_t,0)\
 724                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 725                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 726                 }\
 727             }\
 728         }                \
 729         break;\
 730     case PIX_FMT_RGB24:\
 731         func(uint8_t,0)\
 732             ((uint8_t*)dest)[0]= r[Y1];\
 733             ((uint8_t*)dest)[1]= g[Y1];\
 734             ((uint8_t*)dest)[2]= b[Y1];\
 735             ((uint8_t*)dest)[3]= r[Y2];\
 736             ((uint8_t*)dest)[4]= g[Y2];\
 737             ((uint8_t*)dest)[5]= b[Y2];\
 738             dest+=6;\
 739         }\
 740         break;\
 741     case PIX_FMT_BGR24:\
 742         func(uint8_t,0)\
 743             ((uint8_t*)dest)[0]= b[Y1];\
 744             ((uint8_t*)dest)[1]= g[Y1];\
 745             ((uint8_t*)dest)[2]= r[Y1];\
 746             ((uint8_t*)dest)[3]= b[Y2];\
 747             ((uint8_t*)dest)[4]= g[Y2];\
 748             ((uint8_t*)dest)[5]= r[Y2];\
 749             dest+=6;\
 750         }\
 751         break;\
 752     case PIX_FMT_RGB565BE:\
 753     case PIX_FMT_RGB565LE:\
 754     case PIX_FMT_BGR565BE:\
 755     case PIX_FMT_BGR565LE:\
 756         {\
 757             const int dr1= dither_2x2_8[y&1    ][0];\
 758             const int dg1= dither_2x2_4[y&1    ][0];\
 759             const int db1= dither_2x2_8[(y&1)^1][0];\
 760             const int dr2= dither_2x2_8[y&1    ][1];\
 761             const int dg2= dither_2x2_4[y&1    ][1];\
 762             const int db2= dither_2x2_8[(y&1)^1][1];\
 763             func(uint16_t,0)\
 764                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 765                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 766             }\
 767         }\
 768         break;\
 769     case PIX_FMT_RGB555BE:\
 770     case PIX_FMT_RGB555LE:\
 771     case PIX_FMT_BGR555BE:\
 772     case PIX_FMT_BGR555LE:\
 773         {\
 774             const int dr1= dither_2x2_8[y&1    ][0];\
 775             const int dg1= dither_2x2_8[y&1    ][1];\
 776             const int db1= dither_2x2_8[(y&1)^1][0];\
 777             const int dr2= dither_2x2_8[y&1    ][1];\
 778             const int dg2= dither_2x2_8[y&1    ][0];\
 779             const int db2= dither_2x2_8[(y&1)^1][1];\
 780             func(uint16_t,0)\
 781                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 782                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 783             }\
 784         }\
 785         break;\
 786     case PIX_FMT_RGB444BE:\
 787     case PIX_FMT_RGB444LE:\
 788     case PIX_FMT_BGR444BE:\
 789     case PIX_FMT_BGR444LE:\
 790         {\
 791             const int dr1= dither_4x4_16[y&3    ][0];\
 792             const int dg1= dither_4x4_16[y&3    ][1];\
 793             const int db1= dither_4x4_16[(y&3)^3][0];\
 794             const int dr2= dither_4x4_16[y&3    ][1];\
 795             const int dg2= dither_4x4_16[y&3    ][0];\
 796             const int db2= dither_4x4_16[(y&3)^3][1];\
 797             func(uint16_t,0)\
 798                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 799                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 800             }\
 801         }\
 802         break;\
 803     case PIX_FMT_RGB8:\
 804     case PIX_FMT_BGR8:\
 805         {\
 806             const uint8_t * const d64= dither_8x8_73[y&7];\
 807             const uint8_t * const d32= dither_8x8_32[y&7];\
 808             func(uint8_t,0)\
 809                 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
 810                 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
 811             }\
 812         }\
 813         break;\
 814     case PIX_FMT_RGB4:\
 815     case PIX_FMT_BGR4:\
 816         {\
 817             const uint8_t * const d64= dither_8x8_73 [y&7];\
 818             const uint8_t * const d128=dither_8x8_220[y&7];\
 819             func(uint8_t,0)\
 820                 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
 821                                  + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
 822             }\
 823         }\
 824         break;\
 825     case PIX_FMT_RGB4_BYTE:\
 826     case PIX_FMT_BGR4_BYTE:\
 827         {\
 828             const uint8_t * const d64= dither_8x8_73 [y&7];\
 829             const uint8_t * const d128=dither_8x8_220[y&7];\
 830             func(uint8_t,0)\
 831                 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
 832                 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
 833             }\
 834         }\
 835         break;\
 836     case PIX_FMT_MONOBLACK:\
 837     case PIX_FMT_MONOWHITE:\
 838         {\
 839             func_monoblack\
 840         }\
 841         break;\
 842     case PIX_FMT_YUYV422:\
 843         func2\
 844             ((uint8_t*)dest)[2*i2+0]= Y1;\
 845             ((uint8_t*)dest)[2*i2+1]= U;\
 846             ((uint8_t*)dest)[2*i2+2]= Y2;\
 847             ((uint8_t*)dest)[2*i2+3]= V;\
 848         }                \
 849         break;\
 850     case PIX_FMT_UYVY422:\
 851         func2\
 852             ((uint8_t*)dest)[2*i2+0]= U;\
 853             ((uint8_t*)dest)[2*i2+1]= Y1;\
 854             ((uint8_t*)dest)[2*i2+2]= V;\
 855             ((uint8_t*)dest)[2*i2+3]= Y2;\
 856         }                \
 857         break;\
 858     case PIX_FMT_GRAY16BE:\
 859         func_g16\
 860             ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
 861             ((uint8_t*)dest)[2*i2+1]= Y1;\
 862             ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
 863             ((uint8_t*)dest)[2*i2+3]= Y2;\
 864         }                \
 865         break;\
 866     case PIX_FMT_GRAY16LE:\
 867         func_g16\
 868             ((uint8_t*)dest)[2*i2+0]= Y1;\
 869             ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
 870             ((uint8_t*)dest)[2*i2+2]= Y2;\
 871             ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
 872         }                \
 873         break;\
 874     }
 875
 876 static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
 877                           const int16_t **lumSrc, int lumFilterSize,
 878                           const int16_t *chrFilter, const int16_t **chrUSrc,
 879                           const int16_t **chrVSrc, int chrFilterSize,
 880                           const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 881 {
 882     int i;
 883     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
 884 }
 885
 886 static inline void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
 887                                    const int16_t **lumSrc, int lumFilterSize,
 888                                    const int16_t *chrFilter, const int16_t **chrUSrc,
 889                                    const int16_t **chrVSrc, int chrFilterSize,
 890                                    const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 891 {
 892     int i;
 893     int step= c->dstFormatBpp/8;
 894     int aidx= 3;
 895
 896     switch(c->dstFormat) {
 897     case PIX_FMT_ARGB:
 898         dest++;
 899         aidx= 0;
 900     case PIX_FMT_RGB24:
 901         aidx--;
 902     case PIX_FMT_RGBA:
 903         if (CONFIG_SMALL) {
 904             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 905             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 906                 dest[aidx]= needAlpha ? A : 255;
 907                 dest[0]= R>>22;
 908                 dest[1]= G>>22;
 909                 dest[2]= B>>22;
 910                 dest+= step;
 911             }
 912         } else {
 913             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 914                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 915                     dest[aidx]= A;
 916                     dest[0]= R>>22;
 917                     dest[1]= G>>22;
 918                     dest[2]= B>>22;
 919                     dest+= step;
 920                 }
 921             } else {
 922                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 923                     dest[aidx]= 255;
 924                     dest[0]= R>>22;
 925                     dest[1]= G>>22;
 926                     dest[2]= B>>22;
 927                     dest+= step;
 928                 }
 929             }
 930         }
 931         break;
 932     case PIX_FMT_ABGR:
 933         dest++;
 934         aidx= 0;
 935     case PIX_FMT_BGR24:
 936         aidx--;
 937     case PIX_FMT_BGRA:
 938         if (CONFIG_SMALL) {
 939             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 940             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 941                 dest[aidx]= needAlpha ? A : 255;
 942                 dest[0]= B>>22;
 943                 dest[1]= G>>22;
 944                 dest[2]= R>>22;
 945                 dest+= step;
 946             }
 947         } else {
 948             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 949                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 950                     dest[aidx]= A;
 951                     dest[0]= B>>22;
 952                     dest[1]= G>>22;
 953                     dest[2]= R>>22;
 954                     dest+= step;
 955                 }
 956             } else {
 957                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 958                     dest[aidx]= 255;
 959                     dest[0]= B>>22;
 960                     dest[1]= G>>22;
 961                     dest[2]= R>>22;
 962                     dest+= step;
 963                 }
 964             }
 965         }
 966         break;
 967     default:
 968         assert(0);
 969     }
 970 }
 971
 972 static void fillPlane(uint8_t* plane, int stride, int width, int height, int y, uint8_t val)
 973 {
 974     int i;
 975     uint8_t *ptr = plane + stride*y;
 976     for (i=0; i<height; i++) {
 977         memset(ptr, val, width);
 978         ptr += stride;
 979     }
 980 }
 981
 982 static void rgb48ToY_c(uint8_t *dst, const uint8_t *src, int width,
 983                        uint32_t *unused)
 984 {
 985     int i;
 986     for (i = 0; i < width; i++) {
 987         int r = src[i*6+0];
 988         int g = src[i*6+2];
 989         int b = src[i*6+4];
 990
 991         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
 992     }
 993 }
 994
 995 static void rgb48ToUV_c(uint8_t *dstU, uint8_t *dstV,
 996                         const uint8_t *src1, const uint8_t *src2,
 997                         int width, uint32_t *unused)
 998 {
 999     int i;
1000     assert(src1==src2);
1001     for (i = 0; i < width; i++) {
1002         int r = src1[6*i + 0];
1003         int g = src1[6*i + 2];
1004         int b = src1[6*i + 4];
1005
1006         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1007         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1008     }
1009 }
1010
1011 static void rgb48ToUV_half_c(uint8_t *dstU, uint8_t *dstV,
1012                              const uint8_t *src1, const uint8_t *src2,
1013                              int width, uint32_t *unused)
1014 {
1015     int i;
1016     assert(src1==src2);
1017     for (i = 0; i < width; i++) {
1018         int r= src1[12*i + 0] + src1[12*i + 6];
1019         int g= src1[12*i + 2] + src1[12*i + 8];
1020         int b= src1[12*i + 4] + src1[12*i + 10];
1021
1022         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1023         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1024     }
1025 }
1026
1027 static void bgr48ToY_c(uint8_t *dst, const uint8_t *src, int width,
1028                        uint32_t *unused)
1029 {
1030     int i;
1031     for (i = 0; i < width; i++) {
1032         int b = src[i*6+0];
1033         int g = src[i*6+2];
1034         int r = src[i*6+4];
1035
1036         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1037     }
1038 }
1039
1040 static void bgr48ToUV_c(uint8_t *dstU, uint8_t *dstV,
1041                         const uint8_t *src1, const uint8_t *src2,
1042                         int width, uint32_t *unused)
1043 {
1044     int i;
1045     for (i = 0; i < width; i++) {
1046         int b = src1[6*i + 0];
1047         int g = src1[6*i + 2];
1048         int r = src1[6*i + 4];
1049
1050         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1051         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1052     }
1053 }
1054
1055 static void bgr48ToUV_half_c(uint8_t *dstU, uint8_t *dstV,
1056                              const uint8_t *src1, const uint8_t *src2,
1057                              int width, uint32_t *unused)
1058 {
1059     int i;
1060     for (i = 0; i < width; i++) {
1061         int b= src1[12*i + 0] + src1[12*i + 6];
1062         int g= src1[12*i + 2] + src1[12*i + 8];
1063         int r= src1[12*i + 4] + src1[12*i + 10];
1064
1065         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1066         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1067     }
1068 }
1069
1070 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1071 static void name ## _c(uint8_t *dst, const uint8_t *src, \
1072                        int width, uint32_t *unused)\
1073 {\
1074     int i;\
1075     for (i=0; i<width; i++) {\
1076         int b= (((const type*)src)[i]>>shb)&maskb;\
1077         int g= (((const type*)src)[i]>>shg)&maskg;\
1078         int r= (((const type*)src)[i]>>shr)&maskr;\
1079 \
1080         dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1081     }\
1082 }
1083
1084 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1085 BGR2Y(uint32_t,bgr321ToY,16,16, 0, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1086 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1087 BGR2Y(uint32_t,rgb321ToY, 0,16,16, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1088 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1089 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1090 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1091 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1092
1093 static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
1094 {
1095     int i;
1096     for (i=0; i<width; i++) {
1097         dst[i]= src[4*i];
1098     }
1099 }
1100
1101 #define BGR2UV(type, name, shr, shg, shb, shp, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S) \
1102 static void name ## _c(uint8_t *dstU, uint8_t *dstV, \
1103                        const uint8_t *src, const uint8_t *dummy, \
1104                        int width, uint32_t *unused)\
1105 {\
1106     int i;\
1107     for (i=0; i<width; i++) {\
1108         int b= ((((const type*)src)[i]>>shp)&maskb)>>shb;\
1109         int g= ((((const type*)src)[i]>>shp)&maskg)>>shg;\
1110         int r= ((((const type*)src)[i]>>shp)&maskr)>>shr;\
1111 \
1112         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1113         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1114     }\
1115 }\
1116 static void name ## _half_c(uint8_t *dstU, uint8_t *dstV, \
1117                             const uint8_t *src, const uint8_t *dummy, \
1118                             int width, uint32_t *unused)\
1119 {\
1120     int i;\
1121     for (i=0; i<width; i++) {\
1122         int pix0= ((const type*)src)[2*i+0]>>shp;\
1123         int pix1= ((const type*)src)[2*i+1]>>shp;\
1124         int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
1125         int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1126         int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1127         g&= maskg|(2*maskg);\
1128 \
1129         g>>=shg;\
1130 \
1131         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1132         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1133     }\
1134 }
1135
1136 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1137 BGR2UV(uint32_t,bgr321ToUV,16, 0, 0, 8, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1138 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1139 BGR2UV(uint32_t,rgb321ToUV, 0, 0,16, 8,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1140 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1141 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1142 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1143 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1144
1145 static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
1146 {
1147     int i;
1148     for (i=0; i<width; i++) {
1149         int d= src[i];
1150
1151         dst[i]= pal[d] & 0xFF;
1152     }
1153 }
1154
1155 static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
1156                       const uint8_t *src1, const uint8_t *src2,
1157                       int width, uint32_t *pal)
1158 {
1159     int i;
1160     assert(src1 == src2);
1161     for (i=0; i<width; i++) {
1162         int p= pal[src1[i]];
1163
1164         dstU[i]= p>>8;
1165         dstV[i]= p>>16;
1166     }
1167 }
1168
1169 static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
1170                           int width, uint32_t *unused)
1171 {
1172     int i, j;
1173     for (i=0; i<width/8; i++) {
1174         int d= ~src[i];
1175         for(j=0; j<8; j++)
1176             dst[8*i+j]= ((d>>(7-j))&1)*255;
1177     }
1178 }
1179
1180 static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
1181                           int width, uint32_t *unused)
1182 {
1183     int i, j;
1184     for (i=0; i<width/8; i++) {
1185         int d= src[i];
1186         for(j=0; j<8; j++)
1187             dst[8*i+j]= ((d>>(7-j))&1)*255;
1188     }
1189 }
1190
1191 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
1192                        const int16_t *chrUSrc, const int16_t *chrVSrc,
1193                        const int16_t *alpSrc,
1194                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
1195                        uint8_t *aDest, int dstW, int chrDstW)
1196 {
1197     int i;
1198     for (i=0; i<dstW; i++) {
1199         int val= (lumSrc[i]+64)>>7;
1200         dest[i]= av_clip_uint8(val);
1201     }
1202
1203     if (uDest)
1204         for (i=0; i<chrDstW; i++) {
1205             int u=(chrUSrc[i]+64)>>7;
1206             int v=(chrVSrc[i]+64)>>7;
1207             uDest[i]= av_clip_uint8(u);
1208             vDest[i]= av_clip_uint8(v);
1209         }
1210
1211     if (CONFIG_SWSCALE_ALPHA && aDest)
1212         for (i=0; i<dstW; i++) {
1213             int val= (alpSrc[i]+64)>>7;
1214             aDest[i]= av_clip_uint8(val);
1215         }
1216 }
1217
1218 /**
1219  * vertical bilinear scale YV12 to RGB
1220  */
1221 static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
1222                           const uint16_t *buf1, const uint16_t *ubuf0,
1223                           const uint16_t *ubuf1, const uint16_t *vbuf0,
1224                           const uint16_t *vbuf1, const uint16_t *abuf0,
1225                           const uint16_t *abuf1, uint8_t *dest, int dstW,
1226                           int yalpha, int uvalpha, int y)
1227 {
1228     int  yalpha1=4095- yalpha;
1229     int uvalpha1=4095-uvalpha;
1230     int i;
1231
1232     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1233 }
1234
1235 /**
1236  * YV12 to RGB without scaling or interpolating
1237  */
1238 static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
1239                           const uint16_t *ubuf0, const uint16_t *ubuf1,
1240                           const uint16_t *vbuf0, const uint16_t *vbuf1,
1241                           const uint16_t *abuf0, uint8_t *dest, int dstW,
1242                           int uvalpha, enum PixelFormat dstFormat,
1243                           int flags, int y)
1244 {
1245     const int yalpha1=0;
1246     int i;
1247
1248     const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1249     const int yalpha= 4096; //FIXME ...
1250
1251     if (uvalpha < 2048) {
1252         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1253     } else {
1254         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1255     }
1256 }
1257
1258 //FIXME yuy2* can read up to 7 samples too much
1259
1260 static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
1261                       uint32_t *unused)
1262 {
1263     int i;
1264     for (i=0; i<width; i++)
1265         dst[i]= src[2*i];
1266 }
1267
1268 static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1269                        const uint8_t *src2, int width, uint32_t *unused)
1270 {
1271     int i;
1272     for (i=0; i<width; i++) {
1273         dstU[i]= src1[4*i + 1];
1274         dstV[i]= src1[4*i + 3];
1275     }
1276     assert(src1 == src2);
1277 }
1278
1279 static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1280                      const uint8_t *src2, int width, uint32_t *unused)
1281 {
1282     int i;
1283     for (i=0; i<width; i++) {
1284         dstU[i]= src1[2*i + 1];
1285         dstV[i]= src2[2*i + 1];
1286     }
1287 }
1288
1289 /* This is almost identical to the previous, end exists only because
1290  * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1291 static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
1292                       uint32_t *unused)
1293 {
1294     int i;
1295     for (i=0; i<width; i++)
1296         dst[i]= src[2*i+1];
1297 }
1298
1299 static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1300                        const uint8_t *src2, int width, uint32_t *unused)
1301 {
1302     int i;
1303     for (i=0; i<width; i++) {
1304         dstU[i]= src1[4*i + 0];
1305         dstV[i]= src1[4*i + 2];
1306     }
1307     assert(src1 == src2);
1308 }
1309
1310 static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1311                      const uint8_t *src2, int width, uint32_t *unused)
1312 {
1313     int i;
1314     for (i=0; i<width; i++) {
1315         dstU[i]= src1[2*i];
1316         dstV[i]= src2[2*i];
1317     }
1318 }
1319
1320 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1321                                         const uint8_t *src, int width)
1322 {
1323     int i;
1324     for (i = 0; i < width; i++) {
1325         dst1[i] = src[2*i+0];
1326         dst2[i] = src[2*i+1];
1327     }
1328 }
1329
1330 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1331                        const uint8_t *src1, const uint8_t *src2,
1332                        int width, uint32_t *unused)
1333 {
1334     nvXXtoUV_c(dstU, dstV, src1, width);
1335 }
1336
1337 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1338                        const uint8_t *src1, const uint8_t *src2,
1339                        int width, uint32_t *unused)
1340 {
1341     nvXXtoUV_c(dstV, dstU, src1, width);
1342 }
1343
1344 // FIXME Maybe dither instead.
1345 #define YUV_NBPS(depth, endianness, rfunc) \
1346 static void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1347                                           const uint8_t *_srcU, const uint8_t *_srcV, \
1348                                           int width, uint32_t *unused) \
1349 { \
1350     int i; \
1351     const uint16_t *srcU = (const uint16_t*)_srcU; \
1352     const uint16_t *srcV = (const uint16_t*)_srcV; \
1353     for (i = 0; i < width; i++) { \
1354         dstU[i] = rfunc(&srcU[i])>>(depth-8); \
1355         dstV[i] = rfunc(&srcV[i])>>(depth-8); \
1356     } \
1357 } \
1358 \
1359 static void endianness ## depth ## ToY_c(uint8_t *dstY, const uint8_t *_srcY, \
1360                                          int width, uint32_t *unused) \
1361 { \
1362     int i; \
1363     const uint16_t *srcY = (const uint16_t*)_srcY; \
1364     for (i = 0; i < width; i++) \
1365         dstY[i] = rfunc(&srcY[i])>>(depth-8); \
1366 } \
1367
1368 YUV_NBPS( 9, LE, AV_RL16)
1369 YUV_NBPS( 9, BE, AV_RB16)
1370 YUV_NBPS(10, LE, AV_RL16)
1371 YUV_NBPS(10, BE, AV_RB16)
1372
1373 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1374                        int width, uint32_t *unused)
1375 {
1376     int i;
1377     for (i=0; i<width; i++) {
1378         int b= src[i*3+0];
1379         int g= src[i*3+1];
1380         int r= src[i*3+2];
1381
1382         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1383     }
1384 }
1385
1386 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1387                         const uint8_t *src2, int width, uint32_t *unused)
1388 {
1389     int i;
1390     for (i=0; i<width; i++) {
1391         int b= src1[3*i + 0];
1392         int g= src1[3*i + 1];
1393         int r= src1[3*i + 2];
1394
1395         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1396         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1397     }
1398     assert(src1 == src2);
1399 }
1400
1401 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1402                              const uint8_t *src2, int width, uint32_t *unused)
1403 {
1404     int i;
1405     for (i=0; i<width; i++) {
1406         int b= src1[6*i + 0] + src1[6*i + 3];
1407         int g= src1[6*i + 1] + src1[6*i + 4];
1408         int r= src1[6*i + 2] + src1[6*i + 5];
1409
1410         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1411         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1412     }
1413     assert(src1 == src2);
1414 }
1415
1416 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1417                        uint32_t *unused)
1418 {
1419     int i;
1420     for (i=0; i<width; i++) {
1421         int r= src[i*3+0];
1422         int g= src[i*3+1];
1423         int b= src[i*3+2];
1424
1425         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1426     }
1427 }
1428
1429 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1430                         const uint8_t *src2, int width, uint32_t *unused)
1431 {
1432     int i;
1433     assert(src1==src2);
1434     for (i=0; i<width; i++) {
1435         int r= src1[3*i + 0];
1436         int g= src1[3*i + 1];
1437         int b= src1[3*i + 2];
1438
1439         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1440         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1441     }
1442 }
1443
1444 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1445                              const uint8_t *src2, int width, uint32_t *unused)
1446 {
1447     int i;
1448     assert(src1==src2);
1449     for (i=0; i<width; i++) {
1450         int r= src1[6*i + 0] + src1[6*i + 3];
1451         int g= src1[6*i + 1] + src1[6*i + 4];
1452         int b= src1[6*i + 2] + src1[6*i + 5];
1453
1454         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1455         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1456     }
1457 }
1458
1459
1460 // bilinear / bicubic scaling
1461 static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
1462                      int srcW, int xInc,
1463                      const int16_t *filter, const int16_t *filterPos,
1464                      int filterSize)
1465 {
1466     int i;
1467     for (i=0; i<dstW; i++) {
1468         int j;
1469         int srcPos= filterPos[i];
1470         int val=0;
1471         for (j=0; j<filterSize; j++) {
1472             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1473         }
1474         //filter += hFilterSize;
1475         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
1476         //dst[i] = val>>7;
1477     }
1478 }
1479
1480 //FIXME all pal and rgb srcFormats could do this convertion as well
1481 //FIXME all scalers more complex than bilinear could do half of this transform
1482 static void chrRangeToJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
1483 {
1484     int i;
1485     for (i = 0; i < width; i++) {
1486         dstU[i] = (FFMIN(dstU[i],30775)*4663 - 9289992)>>12; //-264
1487         dstV[i] = (FFMIN(dstV[i],30775)*4663 - 9289992)>>12; //-264
1488     }
1489 }
1490 static void chrRangeFromJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
1491 {
1492     int i;
1493     for (i = 0; i < width; i++) {
1494         dstU[i] = (dstU[i]*1799 + 4081085)>>11; //1469
1495         dstV[i] = (dstV[i]*1799 + 4081085)>>11; //1469
1496     }
1497 }
1498 static void lumRangeToJpeg_c(uint16_t *dst, int width)
1499 {
1500     int i;
1501     for (i = 0; i < width; i++)
1502         dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
1503 }
1504 static void lumRangeFromJpeg_c(uint16_t *dst, int width)
1505 {
1506     int i;
1507     for (i = 0; i < width; i++)
1508         dst[i] = (dst[i]*14071 + 33561947)>>14;
1509 }
1510
1511 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1512                            const uint8_t *src, int srcW, int xInc)
1513 {
1514     int i;
1515     unsigned int xpos=0;
1516     for (i=0;i<dstWidth;i++) {
1517         register unsigned int xx=xpos>>16;
1518         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1519         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1520         xpos+=xInc;
1521     }
1522 }
1523
1524 // *** horizontal scale Y line to temp buffer
1525 static inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1526                            const uint8_t *src, int srcW, int xInc,
1527                            const int16_t *hLumFilter,
1528                            const int16_t *hLumFilterPos, int hLumFilterSize,
1529                            uint8_t *formatConvBuffer,
1530                            uint32_t *pal, int isAlpha)
1531 {
1532     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1533     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1534
1535     src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
1536
1537     if (toYV12) {
1538         toYV12(formatConvBuffer, src, srcW, pal);
1539         src= formatConvBuffer;
1540     }
1541
1542     if (!c->hyscale_fast) {
1543         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1544     } else { // fast bilinear upscale / crap downscale
1545         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1546     }
1547
1548     if (convertRange)
1549         convertRange(dst, dstWidth);
1550 }
1551
1552 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1553                            int dstWidth, const uint8_t *src1,
1554                            const uint8_t *src2, int srcW, int xInc)
1555 {
1556     int i;
1557     unsigned int xpos=0;
1558     for (i=0;i<dstWidth;i++) {
1559         register unsigned int xx=xpos>>16;
1560         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1561         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1562         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1563         xpos+=xInc;
1564     }
1565 }
1566
1567 static inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1568                            const uint8_t *src1, const uint8_t *src2,
1569                            int srcW, int xInc, const int16_t *hChrFilter,
1570                            const int16_t *hChrFilterPos, int hChrFilterSize,
1571                            uint8_t *formatConvBuffer, uint32_t *pal)
1572 {
1573
1574     src1 += c->chrSrcOffset;
1575     src2 += c->chrSrcOffset;
1576
1577     if (c->chrToYV12) {
1578         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1579         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1580         src1= formatConvBuffer;
1581         src2= buf2;
1582     }
1583
1584     if (!c->hcscale_fast) {
1585         c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1586         c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1587     } else { // fast bilinear upscale / crap downscale
1588         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1589     }
1590
1591     if (c->chrConvertRange)
1592         c->chrConvertRange(dst1, dst2, dstWidth);
1593 }
1594
1595 #define DEBUG_SWSCALE_BUFFERS 0
1596 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
1597
1598 static int swScale(SwsContext *c, const uint8_t* src[],
1599                    int srcStride[], int srcSliceY,
1600                    int srcSliceH, uint8_t* dst[], int dstStride[])
1601 {
1602     /* load a few things into local vars to make the code more readable? and faster */
1603     const int srcW= c->srcW;
1604     const int dstW= c->dstW;
1605     const int dstH= c->dstH;
1606     const int chrDstW= c->chrDstW;
1607     const int chrSrcW= c->chrSrcW;
1608     const int lumXInc= c->lumXInc;
1609     const int chrXInc= c->chrXInc;
1610     const enum PixelFormat dstFormat= c->dstFormat;
1611     const int flags= c->flags;
1612     int16_t *vLumFilterPos= c->vLumFilterPos;
1613     int16_t *vChrFilterPos= c->vChrFilterPos;
1614     int16_t *hLumFilterPos= c->hLumFilterPos;
1615     int16_t *hChrFilterPos= c->hChrFilterPos;
1616     int16_t *vLumFilter= c->vLumFilter;
1617     int16_t *vChrFilter= c->vChrFilter;
1618     int16_t *hLumFilter= c->hLumFilter;
1619     int16_t *hChrFilter= c->hChrFilter;
1620     int32_t *lumMmxFilter= c->lumMmxFilter;
1621     int32_t *chrMmxFilter= c->chrMmxFilter;
1622     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
1623     const int vLumFilterSize= c->vLumFilterSize;
1624     const int vChrFilterSize= c->vChrFilterSize;
1625     const int hLumFilterSize= c->hLumFilterSize;
1626     const int hChrFilterSize= c->hChrFilterSize;
1627     int16_t **lumPixBuf= c->lumPixBuf;
1628     int16_t **chrUPixBuf= c->chrUPixBuf;
1629     int16_t **chrVPixBuf= c->chrVPixBuf;
1630     int16_t **alpPixBuf= c->alpPixBuf;
1631     const int vLumBufSize= c->vLumBufSize;
1632     const int vChrBufSize= c->vChrBufSize;
1633     uint8_t *formatConvBuffer= c->formatConvBuffer;
1634     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
1635     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
1636     int lastDstY;
1637     uint32_t *pal=c->pal_yuv;
1638
1639     /* vars which will change and which we need to store back in the context */
1640     int dstY= c->dstY;
1641     int lumBufIndex= c->lumBufIndex;
1642     int chrBufIndex= c->chrBufIndex;
1643     int lastInLumBuf= c->lastInLumBuf;
1644     int lastInChrBuf= c->lastInChrBuf;
1645
1646     if (isPacked(c->srcFormat)) {
1647         src[0]=
1648         src[1]=
1649         src[2]=
1650         src[3]= src[0];
1651         srcStride[0]=
1652         srcStride[1]=
1653         srcStride[2]=
1654         srcStride[3]= srcStride[0];
1655     }
1656     srcStride[1]<<= c->vChrDrop;
1657     srcStride[2]<<= c->vChrDrop;
1658
1659     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
1660                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
1661                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
1662     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
1663                    srcSliceY,    srcSliceH,    dstY,    dstH);
1664     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
1665                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
1666
1667     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
1668         static int warnedAlready=0; //FIXME move this into the context perhaps
1669         if (flags & SWS_PRINT_INFO && !warnedAlready) {
1670             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
1671                    "         ->cannot do aligned memory accesses anymore\n");
1672             warnedAlready=1;
1673         }
1674     }
1675
1676     /* Note the user might start scaling the picture in the middle so this
1677        will not get executed. This is not really intended but works
1678        currently, so people might do it. */
1679     if (srcSliceY ==0) {
1680         lumBufIndex=-1;
1681         chrBufIndex=-1;
1682         dstY=0;
1683         lastInLumBuf= -1;
1684         lastInChrBuf= -1;
1685     }
1686
1687     lastDstY= dstY;
1688
1689     for (;dstY < dstH; dstY++) {
1690         unsigned char *dest =dst[0]+dstStride[0]*dstY;
1691         const int chrDstY= dstY>>c->chrDstVSubSample;
1692         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
1693         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
1694         unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
1695
1696         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
1697         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
1698         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
1699         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
1700         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
1701         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
1702         int enough_lines;
1703
1704         //handle holes (FAST_BILINEAR & weird filters)
1705         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
1706         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
1707         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
1708         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
1709
1710         DEBUG_BUFFERS("dstY: %d\n", dstY);
1711         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
1712                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
1713         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
1714                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
1715
1716         // Do we have enough lines in this slice to output the dstY line
1717         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
1718
1719         if (!enough_lines) {
1720             lastLumSrcY = srcSliceY + srcSliceH - 1;
1721             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
1722             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
1723                                             lastLumSrcY, lastChrSrcY);
1724         }
1725
1726         //Do horizontal scaling
1727         while(lastInLumBuf < lastLumSrcY) {
1728             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
1729             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
1730             lumBufIndex++;
1731             assert(lumBufIndex < 2*vLumBufSize);
1732             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
1733             assert(lastInLumBuf + 1 - srcSliceY >= 0);
1734             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
1735                     hLumFilter, hLumFilterPos, hLumFilterSize,
1736                     formatConvBuffer,
1737                     pal, 0);
1738             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
1739                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
1740                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
1741                         formatConvBuffer,
1742                         pal, 1);
1743             lastInLumBuf++;
1744             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
1745                                lumBufIndex,    lastInLumBuf);
1746         }
1747         while(lastInChrBuf < lastChrSrcY) {
1748             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
1749             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
1750             chrBufIndex++;
1751             assert(chrBufIndex < 2*vChrBufSize);
1752             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
1753             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
1754             //FIXME replace parameters through context struct (some at least)
1755
1756             if (c->needs_hcscale)
1757                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
1758                           chrDstW, src1, src2, chrSrcW, chrXInc,
1759                           hChrFilter, hChrFilterPos, hChrFilterSize,
1760                           formatConvBuffer, pal);
1761             lastInChrBuf++;
1762             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
1763                                chrBufIndex,    lastInChrBuf);
1764         }
1765         //wrap buf index around to stay inside the ring buffer
1766         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
1767         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
1768         if (!enough_lines)
1769             break; //we can't output a dstY line so let's try with the next slice
1770
1771 #if HAVE_MMX
1772         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
1773 #endif
1774         if (dstY < dstH-2) {
1775             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1776             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1777             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1778             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1779             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1780                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1781                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
1782                 c->yuv2nv12X(c,
1783                              vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1784                              vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1785                              dest, uDest, dstW, chrDstW, dstFormat);
1786             } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
1787                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1788                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1789                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
1790                     const int16_t *lumBuf = lumSrcPtr[0];
1791                     const int16_t *chrUBuf= chrUSrcPtr[0];
1792                     const int16_t *chrVBuf= chrVSrcPtr[0];
1793                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
1794                     c->yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
1795                                 uDest, vDest, aDest, dstW, chrDstW);
1796                 } else { //General YV12
1797                     c->yuv2yuvX(c,
1798                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1799                                 vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
1800                                 chrVSrcPtr, vChrFilterSize,
1801                                 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
1802                 }
1803             } else {
1804                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
1805                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1806                 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
1807                     int chrAlpha= vChrFilter[2*dstY+1];
1808                     if(flags & SWS_FULL_CHR_H_INT) {
1809                         yuv2rgbX_c_full(c, //FIXME write a packed1_full function
1810                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1811                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr,
1812                                         chrVSrcPtr, vChrFilterSize,
1813                                         alpSrcPtr, dest, dstW, dstY);
1814                     } else {
1815                         c->yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
1816                                        *chrVSrcPtr, *(chrVSrcPtr+1),
1817                                        alpPixBuf ? *alpSrcPtr : NULL,
1818                                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
1819                     }
1820                 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
1821                     int lumAlpha= vLumFilter[2*dstY+1];
1822                     int chrAlpha= vChrFilter[2*dstY+1];
1823                     lumMmxFilter[2]=
1824                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
1825                     chrMmxFilter[2]=
1826                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
1827                     if(flags & SWS_FULL_CHR_H_INT) {
1828                         yuv2rgbX_c_full(c, //FIXME write a packed2_full function
1829                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1830                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1831                                         alpSrcPtr, dest, dstW, dstY);
1832                     } else {
1833                         c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
1834                                        *chrVSrcPtr, *(chrVSrcPtr+1),
1835                                        alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
1836                                        dest, dstW, lumAlpha, chrAlpha, dstY);
1837                     }
1838                 } else { //general RGB
1839                     if(flags & SWS_FULL_CHR_H_INT) {
1840                         yuv2rgbX_c_full(c,
1841                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1842                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1843                                         alpSrcPtr, dest, dstW, dstY);
1844                     } else {
1845                         c->yuv2packedX(c,
1846                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1847                                        vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1848                                        alpSrcPtr, dest, dstW, dstY);
1849                     }
1850                 }
1851             }
1852         } else { // hmm looks like we can't use MMX here without overwriting this array's tail
1853             const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1854             const int16_t **chrUSrcPtr= (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1855             const int16_t **chrVSrcPtr= (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1856             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1857             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1858                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1859                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
1860                 yuv2nv12X_c(c, vLumFilter+dstY*vLumFilterSize,
1861                             lumSrcPtr, vLumFilterSize,
1862                             vChrFilter+chrDstY*vChrFilterSize,
1863                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1864                             dest, uDest, dstW, chrDstW, dstFormat);
1865             } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
1866                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1867                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1868                 if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
1869                     yuv2yuvX16_c(c, vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1870                                  vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1871                                  alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW,
1872                                  dstFormat);
1873                 } else {
1874                     yuv2yuvX_c(c, vLumFilter+dstY*vLumFilterSize,
1875                                lumSrcPtr, vLumFilterSize,
1876                                vChrFilter+chrDstY*vChrFilterSize,
1877                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1878                                alpSrcPtr, dest, uDest, vDest, aDest,
1879                                dstW, chrDstW);
1880                 }
1881             } else {
1882                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
1883                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1884                 if(flags & SWS_FULL_CHR_H_INT) {
1885                     yuv2rgbX_c_full(c,
1886                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1887                                     vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1888                                     alpSrcPtr, dest, dstW, dstY);
1889                 } else {
1890                     yuv2packedX_c(c,
1891                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1892                                   vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1893                                   alpSrcPtr, dest, dstW, dstY);
1894                 }
1895             }
1896         }
1897     }
1898
1899     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
1900         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
1901
1902 #if HAVE_MMX2
1903     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
1904         __asm__ volatile("sfence":::"memory");
1905 #endif
1906     emms_c();
1907
1908     /* store changed local vars back in the context */
1909     c->dstY= dstY;
1910     c->lumBufIndex= lumBufIndex;
1911     c->chrBufIndex= chrBufIndex;
1912     c->lastInLumBuf= lastInLumBuf;
1913     c->lastInChrBuf= lastInChrBuf;
1914
1915     return dstY - lastDstY;
1916 }
1917
1918 static void sws_init_swScale_c(SwsContext *c)
1919 {
1920     enum PixelFormat srcFormat = c->srcFormat,
1921                      dstFormat = c->dstFormat;
1922
1923     c->yuv2nv12X    = yuv2nv12X_c;
1924     if (is16BPS(dstFormat)) {
1925         c->yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
1926     } else if (is9_OR_10BPS(dstFormat)) {
1927         if (dstFormat == PIX_FMT_YUV420P9BE || dstFormat == PIX_FMT_YUV420P9LE) {
1928             c->yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
1929         } else {
1930             c->yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
1931         }
1932     } else {
1933         c->yuv2yuv1     = yuv2yuv1_c;
1934         c->yuv2yuvX     = yuv2yuvX_c;
1935     }
1936     c->yuv2packed1  = yuv2packed1_c;
1937     c->yuv2packed2  = yuv2packed2_c;
1938     c->yuv2packedX  = yuv2packedX_c;
1939
1940     c->hScale       = hScale_c;
1941
1942     if (c->flags & SWS_FAST_BILINEAR)
1943     {
1944         c->hyscale_fast = hyscale_fast_c;
1945         c->hcscale_fast = hcscale_fast_c;
1946     }
1947
1948     c->chrToYV12 = NULL;
1949     switch(srcFormat) {
1950         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
1951         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
1952         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
1953         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
1954         case PIX_FMT_RGB8     :
1955         case PIX_FMT_BGR8     :
1956         case PIX_FMT_PAL8     :
1957         case PIX_FMT_BGR4_BYTE:
1958         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
1959         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
1960         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
1961         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
1962         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
1963         case PIX_FMT_YUV420P16BE:
1964         case PIX_FMT_YUV422P16BE:
1965         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
1966         case PIX_FMT_YUV420P16LE:
1967         case PIX_FMT_YUV422P16LE:
1968         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
1969     }
1970     if (c->chrSrcHSubSample) {
1971         switch(srcFormat) {
1972         case PIX_FMT_RGB48BE:
1973         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half_c; break;
1974         case PIX_FMT_BGR48BE:
1975         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half_c; break;
1976         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half_c;  break;
1977         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half_c; break;
1978         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_half_c; break;
1979         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half_c; break;
1980         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half_c; break;
1981         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half_c;  break;
1982         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half_c; break;
1983         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_half_c; break;
1984         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half_c; break;
1985         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half_c; break;
1986         }
1987     } else {
1988         switch(srcFormat) {
1989         case PIX_FMT_RGB48BE:
1990         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_c; break;
1991         case PIX_FMT_BGR48BE:
1992         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_c; break;
1993         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_c;  break;
1994         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_c; break;
1995         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_c; break;
1996         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_c; break;
1997         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_c; break;
1998         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_c;  break;
1999         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_c; break;
2000         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_c; break;
2001         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_c; break;
2002         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_c; break;
2003         }
2004     }
2005
2006     c->lumToYV12 = NULL;
2007     c->alpToYV12 = NULL;
2008     switch (srcFormat) {
2009     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
2010     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
2011     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2012     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2013     case PIX_FMT_YUYV422  :
2014     case PIX_FMT_YUV420P16BE:
2015     case PIX_FMT_YUV422P16BE:
2016     case PIX_FMT_YUV444P16BE:
2017     case PIX_FMT_Y400A    :
2018     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
2019     case PIX_FMT_UYVY422  :
2020     case PIX_FMT_YUV420P16LE:
2021     case PIX_FMT_YUV422P16LE:
2022     case PIX_FMT_YUV444P16LE:
2023     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c; break;
2024     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c; break;
2025     case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY_c; break;
2026     case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY_c; break;
2027     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c; break;
2028     case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY_c; break;
2029     case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY_c; break;
2030     case PIX_FMT_RGB8     :
2031     case PIX_FMT_BGR8     :
2032     case PIX_FMT_PAL8     :
2033     case PIX_FMT_BGR4_BYTE:
2034     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2035     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2036     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2037     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2038     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2039     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2040     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2041     case PIX_FMT_RGB48BE:
2042     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY_c; break;
2043     case PIX_FMT_BGR48BE:
2044     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY_c; break;
2045     }
2046     if (c->alpPixBuf) {
2047         switch (srcFormat) {
2048         case PIX_FMT_RGB32  :
2049         case PIX_FMT_RGB32_1:
2050         case PIX_FMT_BGR32  :
2051         case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA_c; break;
2052         case PIX_FMT_Y400A  : c->alpToYV12 = yuy2ToY_c; break;
2053         }
2054     }
2055
2056     switch (srcFormat) {
2057     case PIX_FMT_Y400A  :
2058         c->alpSrcOffset = 1;
2059         break;
2060     case PIX_FMT_RGB32  :
2061     case PIX_FMT_BGR32  :
2062         c->alpSrcOffset = 3;
2063         break;
2064     case PIX_FMT_RGB48LE:
2065     case PIX_FMT_BGR48LE:
2066         c->lumSrcOffset = 1;
2067         c->chrSrcOffset = 1;
2068         c->alpSrcOffset = 1;
2069         break;
2070     }
2071
2072     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2073         if (c->srcRange) {
2074             c->lumConvertRange = lumRangeFromJpeg_c;
2075             c->chrConvertRange = chrRangeFromJpeg_c;
2076         } else {
2077             c->lumConvertRange = lumRangeToJpeg_c;
2078             c->chrConvertRange = chrRangeToJpeg_c;
2079         }
2080     }
2081
2082     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2083           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2084         c->needs_hcscale = 1;
2085 }
2086
2087 SwsFunc ff_getSwsFunc(SwsContext *c)
2088 {
2089     sws_init_swScale_c(c);
2090
2091     if (HAVE_MMX)
2092         ff_sws_init_swScale_mmx(c);
2093     if (HAVE_ALTIVEC)
2094         ff_sws_init_swScale_altivec(c);
2095
2096     return swScale;
2097 }