git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP /* NOTE(review): not referenced in the visible part of this file - confirm it is still needed */
  71
     /*
      * Fixed-point RGB -> YUV weights. Each macro takes a floating-point
      * weight, scales it by 219/255 (the *Y macros) or 224/255 (the *U and *V
      * macros), multiplies by 1 << RGB2YUV_SHIFT and rounds the magnitude to
      * int (+0.5, then truncation); the sign is applied afterwards.
      * The first letter presumably names the source channel (B/G/R) and the
      * second the destination component (Y/U/V); the magnitudes match the
      * rows labelled ITU601 in rgb2yuv_table.
      */
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 static const double rgb2yuv_table[8][9]={
     /*
      * Eight rows of nine floating-point RGB -> YUV weights; the trailing
      * comment on each row names the standard the row is meant for.
      * Column order appears to be {GY, BY, RY, GU, BU, RU, GV, BV, RV},
      * judging by the ITU601 rows lining up with the BY..RU macros defined
      * earlier in this file - TODO confirm.
      * The row index is presumably a colorspace id; the lookup itself is not
      * in the visible part of the file.
      */
  84     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  85     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  86     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  87     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  88     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  89     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  90     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  91     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
  92 };
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
 109 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
     /*
      * Two rows of dither offsets in the range 0..3; each row is a 2-entry
      * pattern repeated four times. In YSCALE_YUV_2_ANYRGB_C the row is picked
      * with y&1 and entries [0] and [1] are added to Y1/Y2 before the green
      * table lookup of the RGB565/BGR565 cases.
      */
 110 {  1,   3,   1,   3,   1,   3,   1,   3, },
 111 {  2,   0,   2,   0,   2,   0,   2,   0, },
 112 };
 113
 114 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
     /*
      * Two rows of dither offsets taken from {0, 2, 4, 6}; each row is a
      * 2-entry pattern repeated four times. YSCALE_YUV_2_ANYRGB_C uses row y&1
      * for red and row (y&1)^1 for blue in the RGB565/BGR565 cases, and uses
      * this table for all three channels in the RGB555/BGR555 cases.
      */
 115 {  6,   2,   6,   2,   6,   2,   6,   2, },
 116 {  0,   4,   0,   4,   0,   4,   0,   4, },
 117 };
 118
 119 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
     /*
      * Four rows of dither offsets covering 0..15; each row is a 4-entry
      * pattern stored twice. Declared without static, unlike the 2x2 tables.
      * YSCALE_YUV_2_ANYRGB_C uses row y&3 for red and green and row (y&3)^3
      * for blue in the RGB444/BGR444 cases.
      */
 120 {  8,   4,  11,   7,   8,   4,  11,   7, },
 121 {  2,  14,   1,  13,   2,  14,   1,  13, },
 122 { 10,   6,   9,   5,  10,   6,   9,   5, },
 123 {  0,  12,   3,  15,   0,  12,   3,  15, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
     /*
      * 8x8 matrix of dither offsets in the range 0..31. Declared without
      * static. YSCALE_YUV_2_ANYRGB_C selects row y&7 (as d32) and column
      * (pixel index)&7 for the red and green lookups of the RGB8/BGR8 cases.
      */
 127 { 17,   9,  23,  15,  16,   8,  22,  14, },
 128 {  5,  29,   3,  27,   4,  28,   2,  26, },
 129 { 21,  13,  19,  11,  20,  12,  18,  10, },
 130 {  0,  24,   6,  30,   1,  25,   7,  31, },
 131 { 16,   8,  22,  14,  17,   9,  23,  15, },
 132 {  4,  28,   2,  26,   5,  29,   3,  27, },
 133 { 20,  12,  18,  10,  21,  13,  19,  11, },
 134 {  1,  25,   7,  31,   0,  24,   6,  30, },
 135 };
 136
 137 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
     /*
      * 8x8 matrix of dither offsets in the range 0..72. Declared without
      * static. YSCALE_YUV_2_ANYRGB_C selects row y&7 (under the local name
      * d64) for the blue lookup of the RGB8/BGR8 cases and for the green
      * lookup of the RGB4/BGR4 cases.
      */
 138 {  0,  55,  14,  68,   3,  58,  17,  72, },
 139 { 37,  18,  50,  32,  40,  22,  54,  35, },
 140 {  9,  64,   5,  59,  13,  67,   8,  63, },
 141 { 46,  27,  41,  23,  49,  31,  44,  26, },
 142 {  2,  57,  16,  71,   1,  56,  15,  70, },
 143 { 39,  21,  52,  34,  38,  19,  51,  33, },
 144 { 11,  66,   7,  62,  10,  65,   6,  60, },
 145 { 48,  30,  43,  25,  47,  29,  42,  24, },
 146 };
 147
 148 #if 1 /* this first variant is the one compiled; the #elif/#else variants below can never be selected while this condition is 1 */
     /*
      * dither_8x8_220: 8x8 matrix of dither offsets (0..217 in the active
      * variant). Declared without static. Row y&7 is used (as d128) by
      * YSCALE_YUV_2_MONO2_C and YSCALE_YUV_2_MONOX_C, and by the red/blue
      * lookups of the RGB4/BGR4 cases in YSCALE_YUV_2_ANYRGB_C.
      */
 149 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 150 {117,  62, 158, 103, 113,  58, 155, 100, },
 151 { 34, 199,  21, 186,  31, 196,  17, 182, },
 152 {144,  89, 131,  76, 141,  86, 127,  72, },
 153 {  0, 165,  41, 206,  10, 175,  52, 217, },
 154 {110,  55, 151,  96, 120,  65, 162, 107, },
 155 { 28, 193,  14, 179,  38, 203,  24, 189, },
 156 {138,  83, 124,  69, 148,  93, 134,  79, },
 157 {  7, 172,  48, 213,   3, 168,  45, 210, },
 158 };
 159 #elif 1
 160 // tries to correct a gamma of 1.5
 161 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 162 {  0, 143,  18, 200,   2, 156,  25, 215, },
 163 { 78,  28, 125,  64,  89,  36, 138,  74, },
 164 { 10, 180,   3, 161,  16, 195,   8, 175, },
 165 {109,  51,  93,  38, 121,  60, 105,  47, },
 166 {  1, 152,  23, 210,   0, 147,  20, 205, },
 167 { 85,  33, 134,  71,  81,  30, 130,  67, },
 168 { 14, 190,   6, 171,  12, 185,   5, 166, },
 169 {117,  57, 101,  44, 113,  54,  97,  41, },
 170 };
 171 #elif 1
 172 // tries to correct a gamma of 2.0
 173 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 174 {  0, 124,   8, 193,   0, 140,  12, 213, },
 175 { 55,  14, 104,  42,  66,  19, 119,  52, },
 176 {  3, 168,   1, 145,   6, 187,   3, 162, },
 177 { 86,  31,  70,  21,  99,  39,  82,  28, },
 178 {  0, 134,  11, 206,   0, 129,   9, 200, },
 179 { 62,  17, 114,  48,  58,  16, 109,  45, },
 180 {  5, 181,   2, 157,   4, 175,   1, 151, },
 181 { 95,  36,  78,  26,  90,  34,  74,  24, },
 182 };
 183 #else
 184 // tries to correct a gamma of 2.5
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 186 {  0, 107,   3, 187,   0, 125,   6, 212, },
 187 { 39,   7,  86,  28,  49,  11, 102,  36, },
 188 {  1, 158,   0, 131,   3, 180,   1, 151, },
 189 { 68,  19,  52,  12,  81,  25,  64,  17, },
 190 {  0, 119,   5, 203,   0, 113,   4, 195, },
 191 { 45,   9,  96,  33,  42,   8,  91,  30, },
 192 {  2, 172,   1, 144,   2, 165,   0, 137, },
 193 { 77,  23,  60,  15,  72,  21,  56,  14, },
 194 };
 195 #endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 203                       uint16_t *aDest, int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     int shift = 11 + 16 - output_bits;
 209
 210 #define output_pixel(pos, val) \
 211     if (big_endian) { \
 212         if (output_bits == 16) { \
 213             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 214         } else { \
 215             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 216         } \
 217     } else { \
 218         if (output_bits == 16) { \
 219             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 220         } else { \
 221             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 222         } \
 223     }
 224     for (i = 0; i < dstW; i++) {
 225         int val = 1 << (26-output_bits);
 226         int j;
 227
 228         for (j = 0; j < lumFilterSize; j++)
 229             val += lumSrc[j][i] * lumFilter[j];
 230
 231         output_pixel(&dest[i], val);
 232     }
 233
 234     if (uDest) {
 235         for (i = 0; i < chrDstW; i++) {
 236             int u = 1 << (26-output_bits);
 237             int v = 1 << (26-output_bits);
 238             int j;
 239
 240             for (j = 0; j < chrFilterSize; j++) {
 241                 u += chrUSrc[j][i] * chrFilter[j];
 242                 v += chrVSrc[j][i] * chrFilter[j];
 243             }
 244
 245             output_pixel(&uDest[i], u);
 246             output_pixel(&vDest[i], v);
 247         }
 248     }
 249
 250     if (CONFIG_SWSCALE_ALPHA && aDest) {
 251         for (i = 0; i < dstW; i++) {
 252             int val = 1 << (26-output_bits);
 253             int j;
 254
 255             for (j = 0; j < lumFilterSize; j++)
 256                 val += alpSrc[j][i] * lumFilter[j];
 257
 258             output_pixel(&aDest[i], val);
 259         }
 260     }
 261 #undef output_pixel
 262 }
 263
 264 #define yuv2NBPS(bits, BE_LE, is_be) \
 265 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 266                               const int16_t **lumSrc, int lumFilterSize, \
 267                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 268                               const int16_t **chrVSrc, \
 269                               int chrFilterSize, const int16_t **alpSrc, \
 270                               uint8_t *_dest, uint8_t *_uDest, uint8_t *_vDest, \
 271                               uint8_t *_aDest, int dstW, int chrDstW) \
 272 { \
 273     uint16_t *dest  = (uint16_t *) _dest,  *uDest = (uint16_t *) _uDest, \
 274              *vDest = (uint16_t *) _vDest, *aDest = (uint16_t *) _aDest; \
 275     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 276                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 277                           alpSrc, \
 278                           dest, uDest, vDest, aDest, \
 279                           dstW, chrDstW, is_be, bits); \
 280 }
 281 yuv2NBPS( 9, BE, 1);
 282 yuv2NBPS( 9, LE, 0);
 283 yuv2NBPS(10, BE, 1);
 284 yuv2NBPS(10, LE, 0);
 285 yuv2NBPS(16, BE, 1);
 286 yuv2NBPS(16, LE, 0);
 287
 288 static inline void yuv2yuvX16_c(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 289                                 const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize,
 290                                 const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, int dstW, int chrDstW,
 291                                 enum PixelFormat dstFormat)
 292 {
 293 #define conv16(bits) \
 294     if (isBE(dstFormat)) { \
 295         yuv2yuvX ## bits ## BE_c(c, lumFilter, lumSrc, lumFilterSize, \
 296                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 297                                  alpSrc, \
 298                                  dest, uDest, vDest, aDest, \
 299                                  dstW, chrDstW); \
 300     } else { \
 301         yuv2yuvX ## bits ## LE_c(c, lumFilter, lumSrc, lumFilterSize, \
 302                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 303                                  alpSrc, \
 304                                  dest, uDest, vDest, aDest, \
 305                                  dstW, chrDstW); \
 306     }
 307     if (is16BPS(dstFormat)) {
 308         conv16(16);
 309     } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
 310         conv16(9);
 311     } else {
 312         conv16(10);
 313     }
 314 #undef conv16
 315 }
 316
 317 static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 318                               const int16_t **lumSrc, int lumFilterSize,
 319                               const int16_t *chrFilter, const int16_t **chrUSrc,
 320                               const int16_t **chrVSrc,
 321                               int chrFilterSize, const int16_t **alpSrc,
 322                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 323                               uint8_t *aDest, int dstW, int chrDstW)
 324 {
 325     //FIXME Optimize (just quickly written not optimized..)
 326     int i;
 327     for (i=0; i<dstW; i++) {
 328         int val=1<<18;
 329         int j;
 330         for (j=0; j<lumFilterSize; j++)
 331             val += lumSrc[j][i] * lumFilter[j];
 332
 333         dest[i]= av_clip_uint8(val>>19);
 334     }
 335
 336     if (uDest)
 337         for (i=0; i<chrDstW; i++) {
 338             int u=1<<18;
 339             int v=1<<18;
 340             int j;
 341             for (j=0; j<chrFilterSize; j++) {
 342                 u += chrUSrc[j][i] * chrFilter[j];
 343                 v += chrVSrc[j][i] * chrFilter[j];
 344             }
 345
 346             uDest[i]= av_clip_uint8(u>>19);
 347             vDest[i]= av_clip_uint8(v>>19);
 348         }
 349
 350     if (CONFIG_SWSCALE_ALPHA && aDest)
 351         for (i=0; i<dstW; i++) {
 352             int val=1<<18;
 353             int j;
 354             for (j=0; j<lumFilterSize; j++)
 355                 val += alpSrc[j][i] * lumFilter[j];
 356
 357             aDest[i]= av_clip_uint8(val>>19);
 358         }
 359
 360 }
 361
 362 static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 363                                const int16_t **lumSrc, int lumFilterSize,
 364                                const int16_t *chrFilter, const int16_t **chrUSrc,
 365                                const int16_t **chrVSrc, int chrFilterSize,
 366                                const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest,
 367                                uint8_t *vDest, uint8_t *aDest,
 368                                int dstW, int chrDstW)
 369 {
 370     enum PixelFormat dstFormat = c->dstFormat;
 371
 372     //FIXME Optimize (just quickly written not optimized..)
 373     int i;
 374     for (i=0; i<dstW; i++) {
 375         int val=1<<18;
 376         int j;
 377         for (j=0; j<lumFilterSize; j++)
 378             val += lumSrc[j][i] * lumFilter[j];
 379
 380         dest[i]= av_clip_uint8(val>>19);
 381     }
 382
 383     if (!uDest)
 384         return;
 385
 386     if (dstFormat == PIX_FMT_NV12)
 387         for (i=0; i<chrDstW; i++) {
 388             int u=1<<18;
 389             int v=1<<18;
 390             int j;
 391             for (j=0; j<chrFilterSize; j++) {
 392                 u += chrUSrc[j][i] * chrFilter[j];
 393                 v += chrVSrc[j][i] * chrFilter[j];
 394             }
 395
 396             uDest[2*i]= av_clip_uint8(u>>19);
 397             uDest[2*i+1]= av_clip_uint8(v>>19);
 398         }
 399     else
 400         for (i=0; i<chrDstW; i++) {
 401             int u=1<<18;
 402             int v=1<<18;
 403             int j;
 404             for (j=0; j<chrFilterSize; j++) {
 405                 u += chrUSrc[j][i] * chrFilter[j];
 406                 v += chrVSrc[j][i] * chrFilter[j];
 407             }
 408
 409             uDest[2*i]= av_clip_uint8(v>>19);
 410             uDest[2*i+1]= av_clip_uint8(u>>19);
 411         }
 412 }
 413
 414 #define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha) \
     /* Opens a loop over dstW>>1 pixel pairs; the loop brace is left open for the expansion site to close. */ \
     /* Expects i, dstW, lumSrc/lumFilter/lumFilterSize, chrUSrc/chrVSrc/chrFilter/chrFilterSize and alpSrc in scope. */ \
     /* Per iteration: Y1/Y2 are filtered from lumSrc at columns 2*i and 2*i+1, U/V from chrUSrc/chrVSrc at column i, */ \
     /* each starting at 1<<18 and shifted right by 19. A1/A2 are filtered from alpSrc only when alpha is nonzero. */ \
     /* Nothing is clipped. r, g and b are declared as type pointers but not assigned here. */ \
 415     for (i=0; i<(dstW>>1); i++) {\
 416         int j;\
 417         int Y1 = 1<<18;\
 418         int Y2 = 1<<18;\
 419         int U  = 1<<18;\
 420         int V  = 1<<18;\
 421         int av_unused A1, A2;\
 422         type av_unused *r, *b, *g;\
 423         const int i2= 2*i;\
 424         \
 425         for (j=0; j<lumFilterSize; j++) {\
 426             Y1 += lumSrc[j][i2] * lumFilter[j];\
 427             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 428         }\
 429         for (j=0; j<chrFilterSize; j++) {\
 430             U += chrUSrc[j][i] * chrFilter[j];\
 431             V += chrVSrc[j][i] * chrFilter[j];\
 432         }\
 433         Y1>>=19;\
 434         Y2>>=19;\
 435         U >>=19;\
 436         V >>=19;\
 437         if (alpha) {\
 438             A1 = 1<<18;\
 439             A2 = 1<<18;\
 440             for (j=0; j<lumFilterSize; j++) {\
 441                 A1 += alpSrc[j][i2  ] * lumFilter[j];\
 442                 A2 += alpSrc[j][i2+1] * lumFilter[j];\
 443             }\
 444             A1>>=19;\
 445             A2>>=19;\
 446         }
 447
 /*
  * YSCALE_YUV_2_PACKEDX_NOCLIP_C followed by saturation to 0..255.
  * Y1, Y2, U and V are clamped when bit 8 is set in any of them; A1 and A2
  * are clamped when alpha is nonzero and bit 8 is set in either. Like the
  * macro it wraps, this leaves the per-pixel-pair loop open.
  */
 #define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
         YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha)\
         if ((Y1|Y2|U|V)&256) {\
             Y1 = av_clip_uint8(Y1);\
             Y2 = av_clip_uint8(Y2);\
             U  = av_clip_uint8(U);\
             V  = av_clip_uint8(V);\
         }\
         if (alpha && ((A1|A2)&256)) {\
             A1 = av_clip_uint8(A1);\
             A2 = av_clip_uint8(A2);\
         }
 464
 /*
  * Opens a loop over all dstW pixels (one chroma sample per pixel) and leaves
  * the loop brace open for the expansion site to close.
  *
  * Expects i, dstW, lumSrc/lumFilter/lumFilterSize,
  * chrUSrc/chrVSrc/chrFilter/chrFilterSize and alpSrc in scope.
  *
  * Per iteration it declares:
  *   Y      - luma filter sum, shifted right by 10
  *   U, V   - chroma filter sums started at -(128<<19), shifted right by 10
  *   A      - only assigned when alpha is nonzero: rnd plus the alpha filter
  *            sum, shifted right by 19 and clamped to 0..255 when bit 8 is set
  *   R, G, B - declared for the code that follows; not assigned here
  *
  * The chroma bias is written as -(128<<19) rather than -128<<19 because
  * left-shifting a negative value is undefined behaviour in C; the value is
  * the same.
  */
 #define YSCALE_YUV_2_PACKEDX_FULL_C(rnd,alpha) \
     for (i=0; i<dstW; i++) {\
         int j;\
         int Y = 0;\
         int U = -(128<<19);\
         int V = -(128<<19);\
         int av_unused A;\
         int R,G,B;\
         \
         for (j=0; j<lumFilterSize; j++) {\
             Y += lumSrc[j][i     ] * lumFilter[j];\
         }\
         for (j=0; j<chrFilterSize; j++) {\
             U += chrUSrc[j][i] * chrFilter[j];\
             V += chrVSrc[j][i] * chrFilter[j];\
         }\
         Y >>=10;\
         U >>=10;\
         V >>=10;\
         if (alpha) {\
             A = (rnd);\
             for (j=0; j<lumFilterSize; j++)\
                 A += alpSrc[j][i     ] * lumFilter[j];\
             A >>=19;\
             if (A&256)\
                 A = av_clip_uint8(A);\
         }
 492
 493 #define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
     /* Expands YSCALE_YUV_2_PACKEDX_FULL_C (passing rnd>>3 as its rounding term) and then converts Y/U/V to R/G/B */ \
     /* using the offset and coefficients stored in the context c; rnd is added to the scaled luma. */ \
     /* If bit 30 or 31 is set in any of R, G, B, each is clamped to the range 0 .. (256<<22)-1. */ \
     /* The per-pixel loop opened by the inner macro is still open at the end of this one. */ \
 494     YSCALE_YUV_2_PACKEDX_FULL_C(rnd>>3,alpha)\
 495         Y-= c->yuv2rgb_y_offset;\
 496         Y*= c->yuv2rgb_y_coeff;\
 497         Y+= rnd;\
 498         R= Y + V*c->yuv2rgb_v2r_coeff;\
 499         G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
 500         B= Y +                          U*c->yuv2rgb_u2b_coeff;\
 501         if ((R|G|B)&(0xC0000000)) {\
 502             if (R>=(256<<22))   R=(256<<22)-1; \
 503             else if (R<0)R=0;   \
 504             if (G>=(256<<22))   G=(256<<22)-1; \
 505             else if (G<0)G=0;   \
 506             if (B>=(256<<22))   B=(256<<22)-1; \
 507             else if (B<0)B=0;   \
 508         }
 509
 510 #define YSCALE_YUV_2_GRAY16_C \
     /* Opens a loop over dstW>>1 pixel pairs (brace left open) and filters two luma samples per iteration. */ \
     /* Y1/Y2 start at 1<<18, accumulate lumSrc * lumFilter, are shifted right by 11 and clamped to 0..65535 */ \
     /* when bit 16 is set. U and V are never changed from 1<<18, so they cannot set bit 16 in that test. */ \
     /* NOTE(review): 1<<18 is the start value paired with >>19 elsewhere in this file; with >>11 here it is */ \
     /* not half of the shift step - confirm this offset is intended. */ \
 511     for (i=0; i<(dstW>>1); i++) {\
 512         int j;\
 513         int Y1 = 1<<18;\
 514         int Y2 = 1<<18;\
 515         int U  = 1<<18;\
 516         int V  = 1<<18;\
 517         \
 518         const int i2= 2*i;\
 519         \
 520         for (j=0; j<lumFilterSize; j++) {\
 521             Y1 += lumSrc[j][i2] * lumFilter[j];\
 522             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 523         }\
 524         Y1>>=11;\
 525         Y2>>=11;\
 526         if ((Y1|Y2|U|V)&65536) {\
 527             if (Y1>65535)   Y1=65535; \
 528             else if (Y1<0)Y1=0;   \
 529             if (Y2>65535)   Y2=65535; \
 530             else if (Y2<0)Y2=0;   \
 531         }
 532
 533 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
     /* Clipped multi-tap variant: after YSCALE_YUV_2_PACKEDX_C, point r, g and b at the context lookup tables */ \
     /* selected by the chroma values (r by V, b by U, g by the sum of the U and V entries). Loop is left open. */ \
 534     YSCALE_YUV_2_PACKEDX_C(type,alpha)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
 535     r = (type *)c->table_rV[V];   \
 536     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
 537     b = (type *)c->table_bU[U];
 538
 539 #define YSCALE_YUV_2_PACKED2_C(type,alpha)   \
     /* Two-line blend: opens a loop over dstW>>1 pixel pairs (brace left open). */ \
     /* Y1/Y2 are (buf0*yalpha1 + buf1*yalpha)>>19 at columns 2*i and 2*i+1; U/V blend ubuf0/ubuf1 and */ \
     /* vbuf0/vbuf1 with uvalpha1/uvalpha at column i. A1/A2 blend abuf0/abuf1 only when alpha is nonzero. */ \
     /* No rounding term is added and nothing is clipped here. */ \
 540     for (i=0; i<(dstW>>1); i++) { \
 541         const int i2= 2*i;       \
 542         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
 543         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
 544         int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19;              \
 545         int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19;              \
 546         type av_unused *r, *b, *g;                                    \
 547         int av_unused A1, A2;                                         \
 548         if (alpha) {\
 549             A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
 550             A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
 551         }
 552
 553 #define YSCALE_YUV_2_GRAY16_2_C   \
     /* Two-line luma blend for 16-bit gray: same weighting as YSCALE_YUV_2_PACKED2_C but shifted right by 11 */ \
     /* instead of 19, and without chroma or alpha. Opens a loop over dstW>>1 pixel pairs (brace left open). */ \
 554     for (i=0; i<(dstW>>1); i++) { \
 555         const int i2= 2*i;       \
 556         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
 557         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;
 558
 559 #define YSCALE_YUV_2_RGB2_C(type,alpha) \
     /* Two-line blend variant: after YSCALE_YUV_2_PACKED2_C, point r, g and b at the context lookup tables */ \
     /* selected by V (red), U plus V (green) and U (blue). U and V are used as indices without clipping. */ \
 560     YSCALE_YUV_2_PACKED2_C(type,alpha)\
 561     r = (type *)c->table_rV[V];\
 562     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 563     b = (type *)c->table_bU[U];
 564
 565 #define YSCALE_YUV_2_PACKED1_C(type,alpha) \
     /* Single-line variant: opens a loop over dstW>>1 pixel pairs (brace left open). */ \
     /* Y1/Y2 are buf0 samples shifted right by 7; U/V are read from ubuf1/vbuf1 (not ubuf0/vbuf0) and */ \
     /* shifted right by 7; A1/A2 are abuf0 samples shifted right by 7 when alpha is nonzero. */ \
 566     for (i=0; i<(dstW>>1); i++) {\
 567         const int i2= 2*i;\
 568         int Y1= buf0[i2  ]>>7;\
 569         int Y2= buf0[i2+1]>>7;\
 570         int U= (ubuf1[i])>>7;\
 571         int V= (vbuf1[i])>>7;\
 572         type av_unused *r, *b, *g;\
 573         int av_unused A1, A2;\
 574         if (alpha) {\
 575             A1= abuf0[i2  ]>>7;\
 576             A2= abuf0[i2+1]>>7;\
 577         }
 578
 579 #define YSCALE_YUV_2_GRAY16_1_C \
     /* Single-line luma for 16-bit gray: Y1/Y2 are buf0 samples shifted left by 1. */ \
     /* Opens a loop over dstW>>1 pixel pairs (brace left open); no chroma or alpha is read. */ \
 580     for (i=0; i<(dstW>>1); i++) {\
 581         const int i2= 2*i;\
 582         int Y1= buf0[i2  ]<<1;\
 583         int Y2= buf0[i2+1]<<1;
 584
 585 #define YSCALE_YUV_2_RGB1_C(type,alpha) \
     /* Single-line variant: after YSCALE_YUV_2_PACKED1_C, point r, g and b at the context lookup tables */ \
     /* selected by V (red), U plus V (green) and U (blue). */ \
 586     YSCALE_YUV_2_PACKED1_C(type,alpha)\
 587     r = (type *)c->table_rV[V];\
 588     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 589     b = (type *)c->table_bU[U];
 590
 591 #define YSCALE_YUV_2_PACKED1B_C(type,alpha) \
     /* Like YSCALE_YUV_2_PACKED1_C for luma and alpha, but chroma is the sum of the two chroma lines */ \
     /* (ubuf0+ubuf1, vbuf0+vbuf1) shifted right by 8, i.e. their average at the >>7 scale. */ \
     /* Opens a loop over dstW>>1 pixel pairs (brace left open). */ \
 592     for (i=0; i<(dstW>>1); i++) {\
 593         const int i2= 2*i;\
 594         int Y1= buf0[i2  ]>>7;\
 595         int Y2= buf0[i2+1]>>7;\
 596         int U= (ubuf0[i] + ubuf1[i])>>8;\
 597         int V= (vbuf0[i] + vbuf1[i])>>8;\
 598         type av_unused *r, *b, *g;\
 599         int av_unused A1, A2;\
 600         if (alpha) {\
 601             A1= abuf0[i2  ]>>7;\
 602             A2= abuf0[i2+1]>>7;\
 603         }
 604
 605 #define YSCALE_YUV_2_RGB1B_C(type,alpha) \
     /* Averaged-chroma single-line variant: after YSCALE_YUV_2_PACKED1B_C, point r, g and b at the context */ \
     /* lookup tables selected by V (red), U plus V (green) and U (blue). */ \
 606     YSCALE_YUV_2_PACKED1B_C(type,alpha)\
 607     r = (type *)c->table_rV[V];\
 608     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 609     b = (type *)c->table_bU[U];
 610
 611 #define YSCALE_YUV_2_MONO2_C \
     /* 1 bit-per-pixel output from a two-line luma blend; unlike the PACKED macros this loop is complete. */ \
     /* Declares d128 (row y&7 of dither_8x8_220) and g (green table for U = V = 128) at the expansion site. */ \
     /* Each iteration handles 8 pixels while i < dstW-7: acc is doubled and the g[] lookup of the blended, */ \
     /* dithered luma is added for each pixel (presumably one bit per lookup - TODO confirm table contents). */ \
     /* The low byte of acc is stored as-is for PIX_FMT_MONOBLACK and inverted otherwise, then dest advances. */ \
 612     const uint8_t * const d128=dither_8x8_220[y&7];\
 613     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 614     for (i=0; i<dstW-7; i+=8) {\
 615         int acc;\
 616         acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
 617         acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
 618         acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
 619         acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
 620         acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
 621         acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
 622         acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
 623         acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
 624         ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 625         dest++;\
 626     }
 627
 628 #define YSCALE_YUV_2_MONOX_C \
     /* 1 bit-per-pixel output from the multi-tap luma filter; this loop is complete (brace closed). */ \
     /* Declares d128 (row y&7 of dither_8x8_220), g (green table for U = V = 128) and acc at the expansion site. */ \
     /* Two pixels per iteration while i < dstW-1: Y1/Y2 start at 1<<18, accumulate lumSrc * lumFilter, */ \
     /* are shifted right by 19 and clamped to 0..255 when bit 8 is set; acc is then doubled and the g[] */ \
     /* lookup of the dithered luma added, once per pixel. */ \
     /* When (i&7)==6, i.e. after every 8th pixel, the low byte of acc is stored (inverted unless the format */ \
     /* is PIX_FMT_MONOBLACK) and dest advances. acc is never reset; the uint8_t store drops older bits. */ \
 629     const uint8_t * const d128=dither_8x8_220[y&7];\
 630     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 631     int acc=0;\
 632     for (i=0; i<dstW-1; i+=2) {\
 633         int j;\
 634         int Y1=1<<18;\
 635         int Y2=1<<18;\
 636 \
 637         for (j=0; j<lumFilterSize; j++) {\
 638             Y1 += lumSrc[j][i] * lumFilter[j];\
 639             Y2 += lumSrc[j][i+1] * lumFilter[j];\
 640         }\
 641         Y1>>=19;\
 642         Y2>>=19;\
 643         if ((Y1|Y2)&256) {\
 644             if (Y1>255)   Y1=255;\
 645             else if (Y1<0)Y1=0;\
 646             if (Y2>255)   Y2=255;\
 647             else if (Y2<0)Y2=0;\
 648         }\
 649         acc+= acc + g[Y1+d128[(i+0)&7]];\
 650         acc+= acc + g[Y2+d128[(i+1)&7]];\
 651         if ((i&7)==6) {\
 652             ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 653             dest++;\
 654         }\
 655     }
 656
 657 #define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
 658     switch(c->dstFormat) {\
 659     case PIX_FMT_RGB48BE:\
 660     case PIX_FMT_RGB48LE:\
 661         func(uint8_t,0)\
 662             ((uint8_t*)dest)[ 0]= r[Y1];\
 663             ((uint8_t*)dest)[ 1]= r[Y1];\
 664             ((uint8_t*)dest)[ 2]= g[Y1];\
 665             ((uint8_t*)dest)[ 3]= g[Y1];\
 666             ((uint8_t*)dest)[ 4]= b[Y1];\
 667             ((uint8_t*)dest)[ 5]= b[Y1];\
 668             ((uint8_t*)dest)[ 6]= r[Y2];\
 669             ((uint8_t*)dest)[ 7]= r[Y2];\
 670             ((uint8_t*)dest)[ 8]= g[Y2];\
 671             ((uint8_t*)dest)[ 9]= g[Y2];\
 672             ((uint8_t*)dest)[10]= b[Y2];\
 673             ((uint8_t*)dest)[11]= b[Y2];\
 674             dest+=12;\
 675         }\
 676         break;\
 677     case PIX_FMT_BGR48BE:\
 678     case PIX_FMT_BGR48LE:\
 679         func(uint8_t,0)\
 680             ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
 681             ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
 682             ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
 683             ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
 684             ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
 685             ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
 686             dest+=12;\
 687         }\
 688         break;\
 689     case PIX_FMT_RGBA:\
 690     case PIX_FMT_BGRA:\
 691         if (CONFIG_SMALL) {\
 692             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 693             func(uint32_t,needAlpha)\
 694                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
 695                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
 696             }\
 697         } else {\
 698             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 699                 func(uint32_t,1)\
 700                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
 701                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
 702                 }\
 703             } else {\
 704                 func(uint32_t,0)\
 705                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 706                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 707                 }\
 708             }\
 709         }\
 710         break;\
 711     case PIX_FMT_ARGB:\
 712     case PIX_FMT_ABGR:\
 713         if (CONFIG_SMALL) {\
 714             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 715             func(uint32_t,needAlpha)\
 716                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
 717                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
 718             }\
 719         } else {\
 720             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 721                 func(uint32_t,1)\
 722                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
 723                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
 724                 }\
 725             } else {\
 726                 func(uint32_t,0)\
 727                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 728                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 729                 }\
 730             }\
 731         }                \
 732         break;\
 733     case PIX_FMT_RGB24:\
 734         func(uint8_t,0)\
 735             ((uint8_t*)dest)[0]= r[Y1];\
 736             ((uint8_t*)dest)[1]= g[Y1];\
 737             ((uint8_t*)dest)[2]= b[Y1];\
 738             ((uint8_t*)dest)[3]= r[Y2];\
 739             ((uint8_t*)dest)[4]= g[Y2];\
 740             ((uint8_t*)dest)[5]= b[Y2];\
 741             dest+=6;\
 742         }\
 743         break;\
 744     case PIX_FMT_BGR24:\
 745         func(uint8_t,0)\
 746             ((uint8_t*)dest)[0]= b[Y1];\
 747             ((uint8_t*)dest)[1]= g[Y1];\
 748             ((uint8_t*)dest)[2]= r[Y1];\
 749             ((uint8_t*)dest)[3]= b[Y2];\
 750             ((uint8_t*)dest)[4]= g[Y2];\
 751             ((uint8_t*)dest)[5]= r[Y2];\
 752             dest+=6;\
 753         }\
 754         break;\
 755     case PIX_FMT_RGB565BE:\
 756     case PIX_FMT_RGB565LE:\
 757     case PIX_FMT_BGR565BE:\
 758     case PIX_FMT_BGR565LE:\
 759         {\
 760             const int dr1= dither_2x2_8[y&1    ][0];\
 761             const int dg1= dither_2x2_4[y&1    ][0];\
 762             const int db1= dither_2x2_8[(y&1)^1][0];\
 763             const int dr2= dither_2x2_8[y&1    ][1];\
 764             const int dg2= dither_2x2_4[y&1    ][1];\
 765             const int db2= dither_2x2_8[(y&1)^1][1];\
 766             func(uint16_t,0)\
 767                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 768                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 769             }\
 770         }\
 771         break;\
 772     case PIX_FMT_RGB555BE:\
 773     case PIX_FMT_RGB555LE:\
 774     case PIX_FMT_BGR555BE:\
 775     case PIX_FMT_BGR555LE:\
 776         {\
 777             const int dr1= dither_2x2_8[y&1    ][0];\
 778             const int dg1= dither_2x2_8[y&1    ][1];\
 779             const int db1= dither_2x2_8[(y&1)^1][0];\
 780             const int dr2= dither_2x2_8[y&1    ][1];\
 781             const int dg2= dither_2x2_8[y&1    ][0];\
 782             const int db2= dither_2x2_8[(y&1)^1][1];\
 783             func(uint16_t,0)\
 784                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 785                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 786             }\
 787         }\
 788         break;\
 789     case PIX_FMT_RGB444BE:\
 790     case PIX_FMT_RGB444LE:\
 791     case PIX_FMT_BGR444BE:\
 792     case PIX_FMT_BGR444LE:\
 793         {\
 794             const int dr1= dither_4x4_16[y&3    ][0];\
 795             const int dg1= dither_4x4_16[y&3    ][1];\
 796             const int db1= dither_4x4_16[(y&3)^3][0];\
 797             const int dr2= dither_4x4_16[y&3    ][1];\
 798             const int dg2= dither_4x4_16[y&3    ][0];\
 799             const int db2= dither_4x4_16[(y&3)^3][1];\
 800             func(uint16_t,0)\
 801                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 802                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 803             }\
 804         }\
 805         break;\
 806     case PIX_FMT_RGB8:\
 807     case PIX_FMT_BGR8:\
 808         {\
 809             const uint8_t * const d64= dither_8x8_73[y&7];\
 810             const uint8_t * const d32= dither_8x8_32[y&7];\
 811             func(uint8_t,0)\
 812                 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
 813                 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
 814             }\
 815         }\
 816         break;\
 817     case PIX_FMT_RGB4:\
 818     case PIX_FMT_BGR4:\
 819         {\
 820             const uint8_t * const d64= dither_8x8_73 [y&7];\
 821             const uint8_t * const d128=dither_8x8_220[y&7];\
 822             func(uint8_t,0)\
 823                 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
 824                                  + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
 825             }\
 826         }\
 827         break;\
 828     case PIX_FMT_RGB4_BYTE:\
 829     case PIX_FMT_BGR4_BYTE:\
 830         {\
 831             const uint8_t * const d64= dither_8x8_73 [y&7];\
 832             const uint8_t * const d128=dither_8x8_220[y&7];\
 833             func(uint8_t,0)\
 834                 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
 835                 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
 836             }\
 837         }\
 838         break;\
 839     case PIX_FMT_MONOBLACK:\
 840     case PIX_FMT_MONOWHITE:\
 841         {\
 842             func_monoblack\
 843         }\
 844         break;\
 845     case PIX_FMT_YUYV422:\
 846         func2\
 847             ((uint8_t*)dest)[2*i2+0]= Y1;\
 848             ((uint8_t*)dest)[2*i2+1]= U;\
 849             ((uint8_t*)dest)[2*i2+2]= Y2;\
 850             ((uint8_t*)dest)[2*i2+3]= V;\
 851         }                \
 852         break;\
 853     case PIX_FMT_UYVY422:\
 854         func2\
 855             ((uint8_t*)dest)[2*i2+0]= U;\
 856             ((uint8_t*)dest)[2*i2+1]= Y1;\
 857             ((uint8_t*)dest)[2*i2+2]= V;\
 858             ((uint8_t*)dest)[2*i2+3]= Y2;\
 859         }                \
 860         break;\
 861     case PIX_FMT_GRAY16BE:\
 862         func_g16\
 863             ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
 864             ((uint8_t*)dest)[2*i2+1]= Y1;\
 865             ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
 866             ((uint8_t*)dest)[2*i2+3]= Y2;\
 867         }                \
 868         break;\
 869     case PIX_FMT_GRAY16LE:\
 870         func_g16\
 871             ((uint8_t*)dest)[2*i2+0]= Y1;\
 872             ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
 873             ((uint8_t*)dest)[2*i2+2]= Y2;\
 874             ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
 875         }                \
 876         break;\
 877     }
 878
 /*
  * Packed-pixel output writer generated entirely by YSCALE_YUV_2_ANYRGB_C.
  *
  * The function body is a single expansion of that macro; the four macro
  * arguments choose the "X" flavour of the RGB, packed-YUV, 16-bit gray and
  * mono code paths. The parameter names (c, lumFilter, lumSrc, chrUSrc, dest,
  * dstW, y, ...) are not referenced here directly; NOTE(review): presumably
  * the macro bodies refer to them by name, so they must not be renamed
  * without checking the macro definitions.
  */
 879 static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
 880                           const int16_t **lumSrc, int lumFilterSize,
 881                           const int16_t *chrFilter, const int16_t **chrUSrc,
 882                           const int16_t **chrVSrc, int chrFilterSize,
 883                           const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 884 {
     /* loop counter used inside the macro expansion (e.g. its RGB4 case
      * indexes dest[i]) */
 885     int i;
 886     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
 887 }
 888
 /*
  * Write one output line as 8-bit-per-component packed RGB/BGR, with or
  * without an alpha byte, for the formats ARGB, RGB24, RGBA, ABGR, BGR24
  * and BGRA. Any other c->dstFormat hits assert(0).
  *
  * R, G, B and A are not declared in this function: they come from the
  * YSCALE_YUV_2_RGBX_FULL_C macro, which also opens the block that the
  * closing brace after "dest+= step" terminates (NOTE(review): presumably a
  * per-pixel loop over dstW using i -- confirm against the macro).
  *
  * The switch relies on intentional fallthrough to set up two values:
  *   - aidx: offset, relative to dest, of the byte that receives alpha;
  *   - dest: advanced by one for the alpha-first formats so that
  *           dest[0..2] always address the three colour bytes.
  */
 889 static inline void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
 890                                    const int16_t **lumSrc, int lumFilterSize,
 891                                    const int16_t *chrFilter, const int16_t **chrUSrc,
 892                                    const int16_t **chrVSrc, int chrFilterSize,
 893                                    const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 894 {
 895     int i;
     /* distance in bytes from one output pixel to the next */
 896     int step= c->dstFormatBpp/8;
     /* default: alpha is the 4th byte (RGBA / BGRA) */
 897     int aidx= 3;
 898
 899     switch(c->dstFormat) {
 900     case PIX_FMT_ARGB:
         /* alpha first: skip it so dest[0] is R; after the decrement below
          * aidx becomes -1, i.e. the byte just before R */
 901         dest++;
 902         aidx= 0;
         /* fall through */
 903     case PIX_FMT_RGB24:
         /* RGB24 arrives here with aidx 3 -> 2: the "alpha" store lands on
          * dest[2] and is overwritten by the B store that follows it */
 904         aidx--;
         /* fall through */
 905     case PIX_FMT_RGBA:
         /* CONFIG_SMALL: one copy of the loop with a run-time alpha choice;
          * otherwise two specialised copies */
 906         if (CONFIG_SMALL) {
 907             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 908             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 909                 dest[aidx]= needAlpha ? A : 255;
 910                 dest[0]= R>>22;
 911                 dest[1]= G>>22;
 912                 dest[2]= B>>22;
 913                 dest+= step;
 914             }
 915         } else {
 916             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 917                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 918                     dest[aidx]= A;
 919                     dest[0]= R>>22;
 920                     dest[1]= G>>22;
 921                     dest[2]= B>>22;
 922                     dest+= step;
 923                 }
 924             } else {
                 /* no alpha plane available: write opaque alpha */
 925                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 926                     dest[aidx]= 255;
 927                     dest[0]= R>>22;
 928                     dest[1]= G>>22;
 929                     dest[2]= B>>22;
 930                     dest+= step;
 931                 }
 932             }
 933         }
 934         break;
     /* Same structure as above with the R and B stores swapped. */
 935     case PIX_FMT_ABGR:
 936         dest++;
 937         aidx= 0;
         /* fall through */
 938     case PIX_FMT_BGR24:
 939         aidx--;
         /* fall through */
 940     case PIX_FMT_BGRA:
 941         if (CONFIG_SMALL) {
 942             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 943             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 944                 dest[aidx]= needAlpha ? A : 255;
 945                 dest[0]= B>>22;
 946                 dest[1]= G>>22;
 947                 dest[2]= R>>22;
 948                 dest+= step;
 949             }
 950         } else {
 951             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 952                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 953                     dest[aidx]= A;
 954                     dest[0]= B>>22;
 955                     dest[1]= G>>22;
 956                     dest[2]= R>>22;
 957                     dest+= step;
 958                 }
 959             } else {
 960                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 961                     dest[aidx]= 255;
 962                     dest[0]= B>>22;
 963                     dest[1]= G>>22;
 964                     dest[2]= R>>22;
 965                     dest+= step;
 966                 }
 967             }
 968         }
 969         break;
 970     default:
         /* unsupported destination format for this writer */
 971         assert(0);
 972     }
 973 }
 974
 /**
  * Fill a rectangular area of a plane with a constant byte.
  *
  * Starting at row y, the first width bytes of each of height consecutive
  * rows are set to val; consecutive rows are stride bytes apart.
  */
 static void fillPlane(uint8_t* plane, int stride, int width, int height, int y, uint8_t val)
 {
     uint8_t *row = plane + stride*y;
     int rows_left;

     for (rows_left = height; rows_left > 0; rows_left--) {
         memset(row, val, width);
         row += stride;
     }
 }
 984
 985 #define rgb48funcs(LE_BE, rfunc, compA, compB, compC) \
 986 static void compA ## compB ## compC ## 48 ## LE_BE ## ToY_c( \
 987                        uint8_t *dst, const uint8_t *src, int width, \
 988                        uint32_t *unused) \
 989 { \
 990     int i; \
 991     for (i = 0; i < width; i++) { \
 992         int compA = rfunc(&src[i*6+0]) >> 8; \
 993         int compB = rfunc(&src[i*6+2]) >> 8; \
 994         int compC = rfunc(&src[i*6+4]) >> 8; \
 995  \
 996         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
 997     } \
 998 } \
 999  \
1000 static void compA ## compB ## compC ## 48 ## LE_BE ## ToUV_c( \
1001                         uint8_t *dstU, uint8_t *dstV, \
1002                         const uint8_t *src1, const uint8_t *src2, \
1003                         int width, uint32_t *unused) \
1004 { \
1005     int i; \
1006     assert(src1==src2); \
1007     for (i = 0; i < width; i++) { \
1008         int compA = rfunc(&src1[6*i + 0]) >> 8; \
1009         int compB = rfunc(&src1[6*i + 2]) >> 8; \
1010         int compC = rfunc(&src1[6*i + 4]) >> 8; \
1011  \
1012         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
1013         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
1014     } \
1015 } \
1016  \
1017 static void compA ## compB ## compC ## 48 ## LE_BE ## ToUV_half_c( \
1018                             uint8_t *dstU, uint8_t *dstV, \
1019                             const uint8_t *src1, const uint8_t *src2, \
1020                             int width, uint32_t *unused) \
1021 { \
1022     int i; \
1023     assert(src1==src2); \
1024     for (i = 0; i < width; i++) { \
1025         int compA = (rfunc(&src1[12*i + 0]) >> 8) + (rfunc(&src1[12*i + 6]) >> 8); \
1026         int compB = (rfunc(&src1[12*i + 2]) >> 8) + (rfunc(&src1[12*i + 8]) >> 8); \
1027         int compC = (rfunc(&src1[12*i + 4]) >> 8) + (rfunc(&src1[12*i + 10]) >> 8); \
1028  \
1029         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1); \
1030         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1); \
1031     } \
1032 }
1033 rgb48funcs(LE, AV_RL16, r, g, b);
1034 rgb48funcs(BE, AV_RB16, r, g, b);
1035 rgb48funcs(LE, AV_RL16, b, g, r);
1036 rgb48funcs(BE, AV_RB16, b, g, r);
1037
1038 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1039 static void name ## _c(uint8_t *dst, const uint8_t *src, \
1040                        int width, uint32_t *unused)\
1041 {\
1042     int i;\
1043     for (i=0; i<width; i++) {\
1044         int b= (((const type*)src)[i]>>shb)&maskb;\
1045         int g= (((const type*)src)[i]>>shg)&maskg;\
1046         int r= (((const type*)src)[i]>>shr)&maskr;\
1047 \
1048         dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1049     }\
1050 }
1051
1052 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1053 BGR2Y(uint32_t,bgr321ToY,16,16, 0, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1054 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1055 BGR2Y(uint32_t,rgb321ToY, 0,16,16, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1056 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1057 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1058 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1059 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1060
/**
 * Extract alpha from packed 4-byte pixels whose first byte is alpha:
 * dst receives byte 0 of each of the first width 4-byte groups in src.
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *px = src;
    uint8_t *out = dst;
    int left;

    for (left = width; left > 0; left--, px += 4)
        *out++ = px[0];
}
1068
/**
 * Extract alpha from packed 4-byte pixels whose last byte is alpha:
 * dst receives byte 3 of each of the first width 4-byte groups in src.
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *px = src;
    uint8_t *out = dst;
    int left;

    for (left = width; left > 0; left--, px += 4)
        *out++ = px[3];
}
1076
1077 #define BGR2UV(type, name, shr, shg, shb, shp, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S) \
1078 static void name ## _c(uint8_t *dstU, uint8_t *dstV, \
1079                        const uint8_t *src, const uint8_t *dummy, \
1080                        int width, uint32_t *unused)\
1081 {\
1082     int i;\
1083     for (i=0; i<width; i++) {\
1084         int b= ((((const type*)src)[i]>>shp)&maskb)>>shb;\
1085         int g= ((((const type*)src)[i]>>shp)&maskg)>>shg;\
1086         int r= ((((const type*)src)[i]>>shp)&maskr)>>shr;\
1087 \
1088         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1089         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1090     }\
1091 }\
1092 static void name ## _half_c(uint8_t *dstU, uint8_t *dstV, \
1093                             const uint8_t *src, const uint8_t *dummy, \
1094                             int width, uint32_t *unused)\
1095 {\
1096     int i;\
1097     for (i=0; i<width; i++) {\
1098         int pix0= ((const type*)src)[2*i+0]>>shp;\
1099         int pix1= ((const type*)src)[2*i+1]>>shp;\
1100         int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
1101         int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1102         int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1103         g&= maskg|(2*maskg);\
1104 \
1105         g>>=shg;\
1106 \
1107         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1108         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1109     }\
1110 }
1111
1112 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1113 BGR2UV(uint32_t,bgr321ToUV,16, 0, 0, 8, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1114 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1115 BGR2UV(uint32_t,rgb321ToUV, 0, 0,16, 8,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1116 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1117 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1118 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1119 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1120
/**
 * Palette lookup for luma: for each source index, store the lowest byte
 * of the corresponding 32-bit palette entry.
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    const uint8_t *index = src;
    uint8_t *out = dst;
    int left;

    for (left = width; left > 0; left--)
        *out++ = pal[*index++] & 0xFF;
}
1130
/**
 * Palette lookup for chroma: for each source index, U is taken from bits
 * 8..15 and V from bits 16..23 of the 32-bit palette entry.
 * Both source pointers must be the same (asserted); only src1 is read.
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    const uint8_t *index = src1;
    uint8_t *u = dstU;
    uint8_t *v = dstV;
    int left;

    assert(src1 == src2);
    for (left = width; left > 0; left--) {
        const int entry = pal[*index++];

        *u++ = entry >> 8;
        *v++ = entry >> 16;
    }
}
1144
/**
 * Expand 1-bit-per-pixel data in which a cleared bit means white.
 *
 * Pixels are packed eight per byte, most significant bit first. Every
 * cleared bit becomes 255 in dst and every set bit becomes 0.
 *
 * All width pixels are written. When width is not a multiple of 8 the
 * leading width%8 bits of one further source byte are consumed (the
 * previous implementation silently left those trailing pixels unwritten).
 *
 * @param dst    output, one byte per pixel, at least width bytes
 * @param src    packed input, at least (width+7)/8 bytes
 * @param width  number of pixels to convert
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;
    for (i = 0; i < width; i++) {
        /* bit 7 of each byte is the leftmost pixel */
        const int bit = (src[i >> 3] >> (7 - (i & 7))) & 1;
        dst[i] = bit ? 0 : 255;
    }
}
1155
/**
 * Expand 1-bit-per-pixel data in which a set bit means white.
 *
 * Pixels are packed eight per byte, most significant bit first. Every
 * set bit becomes 255 in dst and every cleared bit becomes 0.
 *
 * All width pixels are written. When width is not a multiple of 8 the
 * leading width%8 bits of one further source byte are consumed (the
 * previous implementation silently left those trailing pixels unwritten).
 *
 * @param dst    output, one byte per pixel, at least width bytes
 * @param src    packed input, at least (width+7)/8 bytes
 * @param width  number of pixels to convert
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;
    for (i = 0; i < width; i++) {
        /* bit 7 of each byte is the leftmost pixel */
        const int bit = (src[i >> 3] >> (7 - (i & 7))) & 1;
        dst[i] = bit ? 255 : 0;
    }
}
1166
1167 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
1168                        const int16_t *chrUSrc, const int16_t *chrVSrc,
1169                        const int16_t *alpSrc,
1170                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
1171                        uint8_t *aDest, int dstW, int chrDstW)
1172 {
1173     int i;
1174     for (i=0; i<dstW; i++) {
1175         int val= (lumSrc[i]+64)>>7;
1176         dest[i]= av_clip_uint8(val);
1177     }
1178
1179     if (uDest)
1180         for (i=0; i<chrDstW; i++) {
1181             int u=(chrUSrc[i]+64)>>7;
1182             int v=(chrVSrc[i]+64)>>7;
1183             uDest[i]= av_clip_uint8(u);
1184             vDest[i]= av_clip_uint8(v);
1185         }
1186
1187     if (CONFIG_SWSCALE_ALPHA && aDest)
1188         for (i=0; i<dstW; i++) {
1189             int val= (alpSrc[i]+64)>>7;
1190             aDest[i]= av_clip_uint8(val);
1191         }
1192 }
1193
1194 /**
1195  * vertical bilinear scale YV12 to RGB
 *
 * The body is a single expansion of YSCALE_YUV_2_ANYRGB_C with the
 * two-line ("2") variants of the RGB, packed, gray16 and mono macros.
 * NOTE(review): the buffer and weight parameters are presumably referenced
 * by name inside those macros -- confirm before renaming anything here.
1196  */
1197 static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
1198                           const uint16_t *buf1, const uint16_t *ubuf0,
1199                           const uint16_t *ubuf1, const uint16_t *vbuf0,
1200                           const uint16_t *vbuf1, const uint16_t *abuf0,
1201                           const uint16_t *abuf1, uint8_t *dest, int dstW,
1202                           int yalpha, int uvalpha, int y)
1203 {
     /* complementary weights: yalpha + yalpha1 == uvalpha + uvalpha1 == 4095 */
1204     int  yalpha1=4095- yalpha;
1205     int uvalpha1=4095-uvalpha;
     /* loop counter for the macro expansion below */
1206     int i;
1207
1208     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1209 }
1210
1211 /**
1212  * YV12 to RGB without scaling or interpolating
 *
 * Two expansions of YSCALE_YUV_2_ANYRGB_C are compiled in and selected at
 * run time by uvalpha: below 2048 the "1" RGB/packed variants are used,
 * otherwise the "1B" variants. Both branches pass the same gray16 macro
 * and the two-line mono macro (YSCALE_YUV_2_MONO2_C).
 * NOTE(review): what distinguishes the "1" and "1B" variants is defined in
 * the macros, which are not visible here.
1213  */
1214 static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
1215                           const uint16_t *ubuf0, const uint16_t *ubuf1,
1216                           const uint16_t *vbuf0, const uint16_t *vbuf1,
1217                           const uint16_t *abuf0, uint8_t *dest, int dstW,
1218                           int uvalpha, enum PixelFormat dstFormat,
1219                           int flags, int y)
1220 {
     /* yalpha1, buf1 and yalpha are never passed in; they are defined with
      * fixed values (buf1 aliasing buf0, weights 0 / 4096) because the
      * two-line mono macro used below needs them (see the FIXMEs) */
1221     const int yalpha1=0;
1222     int i;
1223
1224     const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1225     const int yalpha= 4096; //FIXME ...
1226
1227     if (uvalpha < 2048) {
1228         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1229     } else {
1230         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1231     }
1232 }
1233
1234 //FIXME yuy2* can read up to 7 samples too much
1235
/**
 * Extract luma from a packed line in which every even byte is a luma
 * sample: dst[k] = src[2*k] for k in [0, width).
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    const uint8_t *in = src;
    uint8_t *out = dst;
    int left;

    for (left = width; left > 0; left--, in += 2)
        *out++ = in[0];
}
1243
/**
 * Extract chroma from a packed line of 4-byte groups: U is byte 1 and V
 * is byte 3 of every group. Only src1 is read; after the copy the two
 * source pointers are asserted to be equal.
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    uint8_t *u = dstU;
    uint8_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, group += 4) {
        *u++ = group[1];
        *v++ = group[3];
    }
    assert(src1 == src2);
}
1254
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the second
 * byte of every sample: U is read from src1, V from src2.
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *in_u = src1;
    const uint8_t *in_v = src2;
    uint8_t *u = dstU;
    uint8_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, in_u += 2, in_v += 2) {
        *u++ = in_u[1];
        *v++ = in_v[1];
    }
}
1264
/* This is almost identical to yuy2ToY_c and exists only because calling
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
 * Luma is every odd byte of the packed line: dst[k] = src[2*k + 1]. */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    const uint8_t *in = src;
    uint8_t *out = dst;
    int left;

    for (left = width; left > 0; left--, in += 2)
        *out++ = in[1];
}
1274
/**
 * Extract chroma from a packed line of 4-byte groups: U is byte 0 and V
 * is byte 2 of every group. Only src1 is read; after the copy the two
 * source pointers are asserted to be equal.
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    uint8_t *u = dstU;
    uint8_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, group += 4) {
        *u++ = group[0];
        *v++ = group[2];
    }
    assert(src1 == src2);
}
1285
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the first
 * byte of every sample: U is read from src1, V from src2.
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *in_u = src1;
    const uint8_t *in_v = src2;
    uint8_t *u = dstU;
    uint8_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, in_u += 2, in_v += 2) {
        *u++ = in_u[0];
        *v++ = in_v[0];
    }
}
1295
1296 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1297                                         const uint8_t *src, int width)
1298 {
1299     int i;
1300     for (i = 0; i < width; i++) {
1301         dst1[i] = src[2*i+0];
1302         dst2[i] = src[2*i+1];
1303     }
1304 }
1305
/**
 * Split an interleaved chroma line whose pairs are ordered U, V.
 * src2 and unused belong to the common converter signature and are
 * not read.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    /* first byte of each pair -> U, second -> V */
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1312
/**
 * Split an interleaved chroma line whose pairs are ordered V, U.
 * src2 and unused belong to the common converter signature and are
 * not read.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    /* first byte of each pair -> V, second -> U (destinations swapped) */
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1319
// FIXME Maybe dither instead.
/*
 * YUV_NBPS(depth, endianness, rfunc) defines two converters that reduce
 * planar samples stored as 16-bit words to 8 bits:
 *
 *   <endianness><depth>ToY_c   one plane;
 *   <endianness><depth>ToUV_c  two planes (U from _srcU, V from _srcV).
 *
 * Each word is read through rfunc and shifted right by depth-8, i.e. the
 * low depth-8 bits are simply dropped.
 */
#define YUV_NBPS(depth, endianness, rfunc) \
static void endianness ## depth ## ToY_c(uint8_t *dstY, const uint8_t *_srcY, \
                                         int width, uint32_t *unused) \
{ \
    const uint16_t *luma16 = (const uint16_t*)_srcY; \
    int x; \
    for (x = 0; x < width; x++) \
        dstY[x] = rfunc(&luma16[x])>>(depth-8); \
} \
\
static void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                                          const uint8_t *_srcU, const uint8_t *_srcV, \
                                          int width, uint32_t *unused) \
{ \
    const uint16_t *cb16 = (const uint16_t*)_srcU; \
    const uint16_t *cr16 = (const uint16_t*)_srcV; \
    int x; \
    for (x = 0; x < width; x++) { \
        dstU[x] = rfunc(&cb16[x])>>(depth-8); \
        dstV[x] = rfunc(&cr16[x])>>(depth-8); \
    } \
}

YUV_NBPS( 9, LE, AV_RL16)
YUV_NBPS( 9, BE, AV_RB16)
YUV_NBPS(10, LE, AV_RL16)
YUV_NBPS(10, BE, AV_RB16)
1348
1349 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1350                        int width, uint32_t *unused)
1351 {
1352     int i;
1353     for (i=0; i<width; i++) {
1354         int b= src[i*3+0];
1355         int g= src[i*3+1];
1356         int r= src[i*3+2];
1357
1358         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1359     }
1360 }
1361
1362 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1363                         const uint8_t *src2, int width, uint32_t *unused)
1364 {
1365     int i;
1366     for (i=0; i<width; i++) {
1367         int b= src1[3*i + 0];
1368         int g= src1[3*i + 1];
1369         int r= src1[3*i + 2];
1370
1371         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1372         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1373     }
1374     assert(src1 == src2);
1375 }
1376
1377 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1378                              const uint8_t *src2, int width, uint32_t *unused)
1379 {
1380     int i;
1381     for (i=0; i<width; i++) {
1382         int b= src1[6*i + 0] + src1[6*i + 3];
1383         int g= src1[6*i + 1] + src1[6*i + 4];
1384         int r= src1[6*i + 2] + src1[6*i + 5];
1385
1386         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1387         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1388     }
1389     assert(src1 == src2);
1390 }
1391
1392 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1393                        uint32_t *unused)
1394 {
1395     int i;
1396     for (i=0; i<width; i++) {
1397         int r= src[i*3+0];
1398         int g= src[i*3+1];
1399         int b= src[i*3+2];
1400
1401         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1402     }
1403 }
1404
1405 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1406                         const uint8_t *src2, int width, uint32_t *unused)
1407 {
1408     int i;
1409     assert(src1==src2);
1410     for (i=0; i<width; i++) {
1411         int r= src1[3*i + 0];
1412         int g= src1[3*i + 1];
1413         int b= src1[3*i + 2];
1414
1415         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1416         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1417     }
1418 }
1419
1420 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1421                              const uint8_t *src2, int width, uint32_t *unused)
1422 {
1423     int i;
1424     assert(src1==src2);
1425     for (i=0; i<width; i++) {
1426         int r= src1[6*i + 0] + src1[6*i + 3];
1427         int g= src1[6*i + 1] + src1[6*i + 4];
1428         int b= src1[6*i + 2] + src1[6*i + 5];
1429
1430         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1431         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1432     }
1433 }
1434
1435
// bilinear / bicubic scaling
/*
 * Generic horizontal FIR scaler. Output sample x is the dot product of
 * filterSize source bytes starting at src[filterPos[x]] with the
 * filterSize coefficients starting at filter[filterSize * x]. The sum is
 * shifted right by 7 and limited to at most 2^15 - 1 (there is no lower
 * clamp). srcW and xInc are not used by this implementation.
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     int srcW, int xInc,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    const int16_t *coeffs = filter;
    int x;

    for (x = 0; x < dstW; x++, coeffs += filterSize) {
        const uint8_t *window = src + filterPos[x];
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)window[tap]) * coeffs[tap];

        dst[x] = FFMIN(acc>>7, (1<<15)-1); // the cubic equation does overflow ...
    }
}
1455
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/*
 * In-place range conversion of two chroma lines: each sample is limited
 * to at most 30775, multiplied by 4663, offset by -9289992 and shifted
 * right by 12.
 */
static void chrRangeToJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    uint16_t *u = dstU;
    uint16_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, u++, v++) {
        *u = (FFMIN(*u,30775)*4663 - 9289992)>>12; //-264
        *v = (FFMIN(*v,30775)*4663 - 9289992)>>12; //-264
    }
}
/*
 * In-place range conversion of two chroma lines: each sample is
 * multiplied by 1799, offset by 4081085 and shifted right by 11.
 */
static void chrRangeFromJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    uint16_t *u = dstU;
    uint16_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, u++, v++) {
        *u = (*u*1799 + 4081085)>>11; //1469
        *v = (*v*1799 + 4081085)>>11; //1469
    }
}
/*
 * In-place range conversion of one luma line: each sample is limited to
 * at most 30189, multiplied by 19077, offset by -39057361 and shifted
 * right by 14.
 */
static void lumRangeToJpeg_c(uint16_t *dst, int width)
{
    uint16_t *sample = dst;
    int left;

    for (left = width; left > 0; left--, sample++)
        *sample = (FFMIN(*sample,30189)*19077 - 39057361)>>14;
}
/*
 * In-place range conversion of one luma line: each sample is multiplied
 * by 14071, offset by 33561947 and shifted right by 14.
 */
static void lumRangeFromJpeg_c(uint16_t *dst, int width)
{
    uint16_t *sample = dst;
    int left;

    for (left = width; left > 0; left--, sample++)
        *sample = (*sample*14071 + 33561947)>>14;
}
1486
1487 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1488                            const uint8_t *src, int srcW, int xInc)
1489 {
1490     int i;
1491     unsigned int xpos=0;
1492     for (i=0;i<dstWidth;i++) {
1493         register unsigned int xx=xpos>>16;
1494         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1495         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1496         xpos+=xInc;
1497     }
1498 }
1499
1500 // *** horizontal scale Y line to temp buffer
1501 static inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1502                            const uint8_t *src, int srcW, int xInc,
1503                            const int16_t *hLumFilter,
1504                            const int16_t *hLumFilterPos, int hLumFilterSize,
1505                            uint8_t *formatConvBuffer,
1506                            uint32_t *pal, int isAlpha)
1507 {
1508     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1509     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1510
1511     if (toYV12) {
1512         toYV12(formatConvBuffer, src, srcW, pal);
1513         src= formatConvBuffer;
1514     }
1515
1516     if (!c->hyscale_fast) {
1517         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1518     } else { // fast bilinear upscale / crap downscale
1519         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1520     }
1521
1522     if (convertRange)
1523         convertRange(dst, dstWidth);
1524 }
1525
1526 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1527                            int dstWidth, const uint8_t *src1,
1528                            const uint8_t *src2, int srcW, int xInc)
1529 {
1530     int i;
1531     unsigned int xpos=0;
1532     for (i=0;i<dstWidth;i++) {
1533         register unsigned int xx=xpos>>16;
1534         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1535         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1536         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1537         xpos+=xInc;
1538     }
1539 }
1540
1541 static inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1542                            const uint8_t *src1, const uint8_t *src2,
1543                            int srcW, int xInc, const int16_t *hChrFilter,
1544                            const int16_t *hChrFilterPos, int hChrFilterSize,
1545                            uint8_t *formatConvBuffer, uint32_t *pal)
1546 {
1547     if (c->chrToYV12) {
1548         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1549         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1550         src1= formatConvBuffer;
1551         src2= buf2;
1552     }
1553
1554     if (!c->hcscale_fast) {
1555         c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1556         c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1557     } else { // fast bilinear upscale / crap downscale
1558         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1559     }
1560
1561     if (c->chrConvertRange)
1562         c->chrConvertRange(dst1, dst2, dstWidth);
1563 }
1564
/* Set to 1 to log buffer pointers, strides and slice bookkeeping. */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Conditional av_log() at debug level; expects a variable named c in the
 * calling scope as the logging context.
 * Wrapped in do { } while (0) so the expansion is a single statement: a
 * bare "if" would capture an "else" that follows the macro call.
 */
#define DEBUG_BUFFERS(...) \
    do { \
        if (DEBUG_SWSCALE_BUFFERS) \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__); \
    } while (0)
1567
1568 static int swScale(SwsContext *c, const uint8_t* src[],
1569                    int srcStride[], int srcSliceY,
1570                    int srcSliceH, uint8_t* dst[], int dstStride[])
1571 {
1572     /* load a few things into local vars to make the code more readable? and faster */
1573     const int srcW= c->srcW;
1574     const int dstW= c->dstW;
1575     const int dstH= c->dstH;
1576     const int chrDstW= c->chrDstW;
1577     const int chrSrcW= c->chrSrcW;
1578     const int lumXInc= c->lumXInc;
1579     const int chrXInc= c->chrXInc;
1580     const enum PixelFormat dstFormat= c->dstFormat;
1581     const int flags= c->flags;
1582     int16_t *vLumFilterPos= c->vLumFilterPos;
1583     int16_t *vChrFilterPos= c->vChrFilterPos;
1584     int16_t *hLumFilterPos= c->hLumFilterPos;
1585     int16_t *hChrFilterPos= c->hChrFilterPos;
1586     int16_t *vLumFilter= c->vLumFilter;
1587     int16_t *vChrFilter= c->vChrFilter;
1588     int16_t *hLumFilter= c->hLumFilter;
1589     int16_t *hChrFilter= c->hChrFilter;
1590     int32_t *lumMmxFilter= c->lumMmxFilter;
1591     int32_t *chrMmxFilter= c->chrMmxFilter;
1592     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
1593     const int vLumFilterSize= c->vLumFilterSize;
1594     const int vChrFilterSize= c->vChrFilterSize;
1595     const int hLumFilterSize= c->hLumFilterSize;
1596     const int hChrFilterSize= c->hChrFilterSize;
1597     int16_t **lumPixBuf= c->lumPixBuf;
1598     int16_t **chrUPixBuf= c->chrUPixBuf;
1599     int16_t **chrVPixBuf= c->chrVPixBuf;
1600     int16_t **alpPixBuf= c->alpPixBuf;
1601     const int vLumBufSize= c->vLumBufSize;
1602     const int vChrBufSize= c->vChrBufSize;
1603     uint8_t *formatConvBuffer= c->formatConvBuffer;
1604     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
1605     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
1606     int lastDstY;
1607     uint32_t *pal=c->pal_yuv;
1608
1609     /* vars which will change and which we need to store back in the context */
1610     int dstY= c->dstY;
1611     int lumBufIndex= c->lumBufIndex;
1612     int chrBufIndex= c->chrBufIndex;
1613     int lastInLumBuf= c->lastInLumBuf;
1614     int lastInChrBuf= c->lastInChrBuf;
1615
1616     if (isPacked(c->srcFormat)) {
1617         src[0]=
1618         src[1]=
1619         src[2]=
1620         src[3]= src[0];
1621         srcStride[0]=
1622         srcStride[1]=
1623         srcStride[2]=
1624         srcStride[3]= srcStride[0];
1625     }
1626     srcStride[1]<<= c->vChrDrop;
1627     srcStride[2]<<= c->vChrDrop;
1628
1629     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
1630                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
1631                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
1632     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
1633                    srcSliceY,    srcSliceH,    dstY,    dstH);
1634     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
1635                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
1636
1637     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
1638         static int warnedAlready=0; //FIXME move this into the context perhaps
1639         if (flags & SWS_PRINT_INFO && !warnedAlready) {
1640             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
1641                    "         ->cannot do aligned memory accesses anymore\n");
1642             warnedAlready=1;
1643         }
1644     }
1645
1646     /* Note the user might start scaling the picture in the middle so this
1647        will not get executed. This is not really intended but works
1648        currently, so people might do it. */
1649     if (srcSliceY ==0) {
1650         lumBufIndex=-1;
1651         chrBufIndex=-1;
1652         dstY=0;
1653         lastInLumBuf= -1;
1654         lastInChrBuf= -1;
1655     }
1656
1657     lastDstY= dstY;
1658
1659     for (;dstY < dstH; dstY++) {
1660         unsigned char *dest =dst[0]+dstStride[0]*dstY;
1661         const int chrDstY= dstY>>c->chrDstVSubSample;
1662         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
1663         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
1664         unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
1665
1666         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
1667         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
1668         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
1669         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
1670         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
1671         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
1672         int enough_lines;
1673
1674         //handle holes (FAST_BILINEAR & weird filters)
1675         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
1676         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
1677         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
1678         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
1679
1680         DEBUG_BUFFERS("dstY: %d\n", dstY);
1681         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
1682                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
1683         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
1684                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
1685
1686         // Do we have enough lines in this slice to output the dstY line
1687         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
1688
1689         if (!enough_lines) {
1690             lastLumSrcY = srcSliceY + srcSliceH - 1;
1691             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
1692             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
1693                                             lastLumSrcY, lastChrSrcY);
1694         }
1695
1696         //Do horizontal scaling
1697         while(lastInLumBuf < lastLumSrcY) {
1698             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
1699             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
1700             lumBufIndex++;
1701             assert(lumBufIndex < 2*vLumBufSize);
1702             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
1703             assert(lastInLumBuf + 1 - srcSliceY >= 0);
1704             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
1705                     hLumFilter, hLumFilterPos, hLumFilterSize,
1706                     formatConvBuffer,
1707                     pal, 0);
1708             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
1709                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
1710                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
1711                         formatConvBuffer,
1712                         pal, 1);
1713             lastInLumBuf++;
1714             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
1715                                lumBufIndex,    lastInLumBuf);
1716         }
1717         while(lastInChrBuf < lastChrSrcY) {
1718             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
1719             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
1720             chrBufIndex++;
1721             assert(chrBufIndex < 2*vChrBufSize);
1722             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
1723             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
1724             //FIXME replace parameters through context struct (some at least)
1725
1726             if (c->needs_hcscale)
1727                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
1728                           chrDstW, src1, src2, chrSrcW, chrXInc,
1729                           hChrFilter, hChrFilterPos, hChrFilterSize,
1730                           formatConvBuffer, pal);
1731             lastInChrBuf++;
1732             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
1733                                chrBufIndex,    lastInChrBuf);
1734         }
1735         //wrap buf index around to stay inside the ring buffer
1736         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
1737         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
1738         if (!enough_lines)
1739             break; //we can't output a dstY line so let's try with the next slice
1740
1741 #if HAVE_MMX
1742         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
1743 #endif
1744         if (dstY < dstH-2) {
1745             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1746             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1747             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1748             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1749             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
1750                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1751                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1752                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
1753                     const int16_t *lumBuf = lumSrcPtr[0];
1754                     const int16_t *chrUBuf= chrUSrcPtr[0];
1755                     const int16_t *chrVBuf= chrVSrcPtr[0];
1756                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
1757                     c->yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
1758                                 uDest, vDest, aDest, dstW, chrDstW);
1759                 } else { //General YV12
1760                     c->yuv2yuvX(c,
1761                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1762                                 vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
1763                                 chrVSrcPtr, vChrFilterSize,
1764                                 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
1765                 }
1766             } else {
1767                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
1768                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1769                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
1770                     int chrAlpha= vChrFilter[2*dstY+1];
1771                     c->yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
1772                                    *chrVSrcPtr, *(chrVSrcPtr+1),
1773                                    alpPixBuf ? *alpSrcPtr : NULL,
1774                                    dest, dstW, chrAlpha, dstFormat, flags, dstY);
1775                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
1776                     int lumAlpha= vLumFilter[2*dstY+1];
1777                     int chrAlpha= vChrFilter[2*dstY+1];
1778                     lumMmxFilter[2]=
1779                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
1780                     chrMmxFilter[2]=
1781                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
1782                     c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
1783                                    *chrVSrcPtr, *(chrVSrcPtr+1),
1784                                    alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
1785                                    dest, dstW, lumAlpha, chrAlpha, dstY);
1786                 } else { //general RGB
1787                     c->yuv2packedX(c,
1788                                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1789                                    vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1790                                    alpSrcPtr, dest, dstW, dstY);
1791                 }
1792             }
1793         } else { // hmm looks like we can't use MMX here without overwriting this array's tail
1794             const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1795             const int16_t **chrUSrcPtr= (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1796             const int16_t **chrVSrcPtr= (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1797             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1798             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1799                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1800                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
1801                 yuv2nv12X_c(c, vLumFilter+dstY*vLumFilterSize,
1802                             lumSrcPtr, vLumFilterSize,
1803                             vChrFilter+chrDstY*vChrFilterSize,
1804                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize, NULL,
1805                             dest, uDest, NULL, NULL, dstW, chrDstW);
1806             } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
1807                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1808                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1809                 if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
1810                     yuv2yuvX16_c(c, vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1811                                  vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1812                                  alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW,
1813                                  dstFormat);
1814                 } else {
1815                     yuv2yuvX_c(c, vLumFilter+dstY*vLumFilterSize,
1816                                lumSrcPtr, vLumFilterSize,
1817                                vChrFilter+chrDstY*vChrFilterSize,
1818                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1819                                alpSrcPtr, dest, uDest, vDest, aDest,
1820                                dstW, chrDstW);
1821                 }
1822             } else {
1823                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
1824                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1825                 if(flags & SWS_FULL_CHR_H_INT) {
1826                     yuv2rgbX_c_full(c,
1827                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1828                                     vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1829                                     alpSrcPtr, dest, dstW, dstY);
1830                 } else {
1831                     yuv2packedX_c(c,
1832                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1833                                   vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1834                                   alpSrcPtr, dest, dstW, dstY);
1835                 }
1836             }
1837         }
1838     }
1839
1840     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
1841         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
1842
1843 #if HAVE_MMX2
1844     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
1845         __asm__ volatile("sfence":::"memory");
1846 #endif
1847     emms_c();
1848
1849     /* store changed local vars back in the context */
1850     c->dstY= dstY;
1851     c->lumBufIndex= lumBufIndex;
1852     c->chrBufIndex= chrBufIndex;
1853     c->lastInLumBuf= lastInLumBuf;
1854     c->lastInChrBuf= lastInChrBuf;
1855
1856     return dstY - lastDstY;
1857 }
1858
1859 static void sws_init_swScale_c(SwsContext *c)
1860 {
1861     enum PixelFormat srcFormat = c->srcFormat,
1862                      dstFormat = c->dstFormat;
1863
1864     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1865         c->yuv2yuvX     = yuv2nv12X_c;
1866     } else if (is16BPS(dstFormat)) {
1867         c->yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
1868     } else if (is9_OR_10BPS(dstFormat)) {
1869         if (dstFormat == PIX_FMT_YUV420P9BE || dstFormat == PIX_FMT_YUV420P9LE) {
1870             c->yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
1871         } else {
1872             c->yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
1873         }
1874     } else {
1875         c->yuv2yuv1     = yuv2yuv1_c;
1876         c->yuv2yuvX     = yuv2yuvX_c;
1877     }
1878     if(c->flags & SWS_FULL_CHR_H_INT) {
1879         c->yuv2packedX = yuv2rgbX_c_full;
1880     } else {
1881         c->yuv2packed1  = yuv2packed1_c;
1882         c->yuv2packed2  = yuv2packed2_c;
1883         c->yuv2packedX  = yuv2packedX_c;
1884     }
1885
1886     c->hScale       = hScale_c;
1887
1888     if (c->flags & SWS_FAST_BILINEAR) {
1889         c->hyscale_fast = hyscale_fast_c;
1890         c->hcscale_fast = hcscale_fast_c;
1891     }
1892
1893     c->chrToYV12 = NULL;
1894     switch(srcFormat) {
1895         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
1896         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
1897         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
1898         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
1899         case PIX_FMT_RGB8     :
1900         case PIX_FMT_BGR8     :
1901         case PIX_FMT_PAL8     :
1902         case PIX_FMT_BGR4_BYTE:
1903         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
1904         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
1905         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
1906         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
1907         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
1908         case PIX_FMT_YUV420P16BE:
1909         case PIX_FMT_YUV422P16BE:
1910         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
1911         case PIX_FMT_YUV420P16LE:
1912         case PIX_FMT_YUV422P16LE:
1913         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
1914     }
1915     if (c->chrSrcHSubSample) {
1916         switch(srcFormat) {
1917         case PIX_FMT_RGB48BE: c->chrToYV12 = rgb48BEToUV_half_c; break;
1918         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48LEToUV_half_c; break;
1919         case PIX_FMT_BGR48BE: c->chrToYV12 = bgr48BEToUV_half_c; break;
1920         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48LEToUV_half_c; break;
1921         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half_c;  break;
1922         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half_c; break;
1923         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_half_c; break;
1924         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half_c; break;
1925         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half_c; break;
1926         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half_c;  break;
1927         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half_c; break;
1928         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_half_c; break;
1929         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half_c; break;
1930         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half_c; break;
1931         }
1932     } else {
1933         switch(srcFormat) {
1934         case PIX_FMT_RGB48BE: c->chrToYV12 = rgb48BEToUV_c; break;
1935         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48LEToUV_c; break;
1936         case PIX_FMT_BGR48BE: c->chrToYV12 = bgr48BEToUV_c; break;
1937         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48LEToUV_c; break;
1938         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_c;  break;
1939         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_c; break;
1940         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_c; break;
1941         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_c; break;
1942         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_c; break;
1943         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_c;  break;
1944         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_c; break;
1945         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_c; break;
1946         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_c; break;
1947         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_c; break;
1948         }
1949     }
1950
1951     c->lumToYV12 = NULL;
1952     c->alpToYV12 = NULL;
1953     switch (srcFormat) {
1954     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
1955     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
1956     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
1957     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
1958     case PIX_FMT_YUYV422  :
1959     case PIX_FMT_YUV420P16BE:
1960     case PIX_FMT_YUV422P16BE:
1961     case PIX_FMT_YUV444P16BE:
1962     case PIX_FMT_Y400A    :
1963     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
1964     case PIX_FMT_UYVY422  :
1965     case PIX_FMT_YUV420P16LE:
1966     case PIX_FMT_YUV422P16LE:
1967     case PIX_FMT_YUV444P16LE:
1968     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c; break;
1969     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c; break;
1970     case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY_c; break;
1971     case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY_c; break;
1972     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c; break;
1973     case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY_c; break;
1974     case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY_c; break;
1975     case PIX_FMT_RGB8     :
1976     case PIX_FMT_BGR8     :
1977     case PIX_FMT_PAL8     :
1978     case PIX_FMT_BGR4_BYTE:
1979     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
1980     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
1981     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
1982     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
1983     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
1984     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
1985     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
1986     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
1987     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
1988     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
1989     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
1990     }
1991     if (c->alpPixBuf) {
1992         switch (srcFormat) {
1993         case PIX_FMT_BGRA:
1994         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
1995         case PIX_FMT_ABGR:
1996         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
1997         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
1998         }
1999     }
2000
2001     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2002         if (c->srcRange) {
2003             c->lumConvertRange = lumRangeFromJpeg_c;
2004             c->chrConvertRange = chrRangeFromJpeg_c;
2005         } else {
2006             c->lumConvertRange = lumRangeToJpeg_c;
2007             c->chrConvertRange = chrRangeToJpeg_c;
2008         }
2009     }
2010
2011     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2012           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2013         c->needs_hcscale = 1;
2014 }
2015
2016 SwsFunc ff_getSwsFunc(SwsContext *c)
2017 {
2018     sws_init_swScale_c(c);
2019
2020     if (HAVE_MMX)
2021         ff_sws_init_swScale_mmx(c);
2022     if (HAVE_ALTIVEC)
2023         ff_sws_init_swScale_altivec(c);
2024
2025     return swScale;
2026 }