git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  /* Defined without a value; nothing in this part of the file tests it. */
  70 #define DITHER1XBPP
  71
  /*
   * Fixed-point RGB -> YUV weights.  Each macro is a floating-point
   * coefficient scaled by 1 << RGB2YUV_SHIFT and rounded to nearest (0.5 is
   * added before the truncating int cast; negative weights negate the rounded
   * magnitude).  The Y weights carry a 219/255 factor and the U/V weights a
   * 224/255 factor.  The coefficient magnitudes equal those in the rows of
   * rgb2yuv_table that are labelled ITU601.
   * The names read as <source channel><target plane>, e.g. BY is presumably
   * the blue contribution to Y -- confirm against the users of these macros.
   */
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  /*
   * Floating-point RGB -> YUV coefficient sets: 8 rows of 9 values, the
   * trailing comment on each row naming the standard it represents.
   * Rows 0 and 1 are identical, as are rows 2, 3, 5 and 6.
   * Comparing the ITU601 rows with the BY..RU macros suggests the column
   * order GY, BY, RY, GU, BU, RU, GV, BV, RV -- TODO confirm.
   * NOTE(review): the row index is presumably a colorspace identifier;
   * verify against the code that indexes this table.
   */
  83 static const double rgb2yuv_table[8][9]={
  84     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  85     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  86     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  87     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  88     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  89     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  90     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  91     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
  92 };
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
 /*
  * File-local dither offsets in the range 0..3: 2 rows of 8 entries, each row
  * repeating with a period of 2.  The RGB565/BGR565 output case selects the
  * row with y&1 and reads columns 0 and 1 for the green channel.
  */
 109 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
 110 {  1,   3,   1,   3,   1,   3,   1,   3, },
 111 {  2,   0,   2,   0,   2,   0,   2,   0, },
 112 };
 113
 /*
  * File-local dither offsets with the values 0, 2, 4 and 6: 2 rows of 8
  * entries, each row repeating with a period of 2.  The 565 and 555 output
  * cases select rows with y&1 or (y&1)^1 and read columns 0 and 1.
  */
 114 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 115 {  6,   2,   6,   2,   6,   2,   6,   2, },
 116 {  0,   4,   0,   4,   0,   4,   0,   4, },
 117 };
 118
 /*
  * Dither offsets in the range 0..15: 4 rows of 8 entries, each row repeating
  * with a period of 4.  The 444 output cases select rows with y&3 or (y&3)^3.
  * Unlike the 2x2 tables this one is declared without `static`.
  */
 119 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 120 {  8,   4,  11,   7,   8,   4,  11,   7, },
 121 {  2,  14,   1,  13,   2,  14,   1,  13, },
 122 { 10,   6,   9,   5,  10,   6,   9,   5, },
 123 {  0,  12,   3,  15,   0,  12,   3,  15, },
 124 };
 125
 /*
  * 8x8 dither offsets in the range 0..31.  The RGB8/BGR8 output case selects
  * the row with y&7 and the column with the pixel index & 7, and applies the
  * offset to the red and green lookups.
  */
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 127 { 17,   9,  23,  15,  16,   8,  22,  14, },
 128 {  5,  29,   3,  27,   4,  28,   2,  26, },
 129 { 21,  13,  19,  11,  20,  12,  18,  10, },
 130 {  0,  24,   6,  30,   1,  25,   7,  31, },
 131 { 16,   8,  22,  14,  17,   9,  23,  15, },
 132 {  4,  28,   2,  26,   5,  29,   3,  27, },
 133 { 20,  12,  18,  10,  21,  13,  19,  11, },
 134 {  1,  25,   7,  31,   0,  24,   6,  30, },
 135 };
 136
 /*
  * 8x8 dither offsets in the range 0..72.  Rows are selected with y&7; the
  * RGB8/BGR8 case uses it for the blue lookup and the RGB4/BGR4 case for the
  * green lookup.
  */
 137 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 138 {  0,  55,  14,  68,   3,  58,  17,  72, },
 139 { 37,  18,  50,  32,  40,  22,  54,  35, },
 140 {  9,  64,   5,  59,  13,  67,   8,  63, },
 141 { 46,  27,  41,  23,  49,  31,  44,  26, },
 142 {  2,  57,  16,  71,   1,  56,  15,  70, },
 143 { 39,  21,  52,  34,  38,  19,  51,  33, },
 144 { 11,  66,   7,  62,  10,  65,   6,  60, },
 145 { 48,  30,  43,  25,  47,  29,  42,  24, },
 146 };
 147
 /*
  * 8x8 dither offsets used by the monochrome and 4-bit output paths (rows are
  * selected with y&7).  Four variants are kept in the source, but because the
  * first condition is `#if 1` only the first table (values 0..217) is ever
  * compiled; the gamma-correcting alternatives below it are inactive.
  */
 148 #if 1
 149 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 150 {117,  62, 158, 103, 113,  58, 155, 100, },
 151 { 34, 199,  21, 186,  31, 196,  17, 182, },
 152 {144,  89, 131,  76, 141,  86, 127,  72, },
 153 {  0, 165,  41, 206,  10, 175,  52, 217, },
 154 {110,  55, 151,  96, 120,  65, 162, 107, },
 155 { 28, 193,  14, 179,  38, 203,  24, 189, },
 156 {138,  83, 124,  69, 148,  93, 134,  79, },
 157 {  7, 172,  48, 213,   3, 168,  45, 210, },
 158 };
 159 #elif 1
 160 // tries to correct a gamma of 1.5
 161 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 162 {  0, 143,  18, 200,   2, 156,  25, 215, },
 163 { 78,  28, 125,  64,  89,  36, 138,  74, },
 164 { 10, 180,   3, 161,  16, 195,   8, 175, },
 165 {109,  51,  93,  38, 121,  60, 105,  47, },
 166 {  1, 152,  23, 210,   0, 147,  20, 205, },
 167 { 85,  33, 134,  71,  81,  30, 130,  67, },
 168 { 14, 190,   6, 171,  12, 185,   5, 166, },
 169 {117,  57, 101,  44, 113,  54,  97,  41, },
 170 };
 171 #elif 1
 172 // tries to correct a gamma of 2.0
 173 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 174 {  0, 124,   8, 193,   0, 140,  12, 213, },
 175 { 55,  14, 104,  42,  66,  19, 119,  52, },
 176 {  3, 168,   1, 145,   6, 187,   3, 162, },
 177 { 86,  31,  70,  21,  99,  39,  82,  28, },
 178 {  0, 134,  11, 206,   0, 129,   9, 200, },
 179 { 62,  17, 114,  48,  58,  16, 109,  45, },
 180 {  5, 181,   2, 157,   4, 175,   1, 151, },
 181 { 95,  36,  78,  26,  90,  34,  74,  24, },
 182 };
 183 #else
 184 // tries to correct a gamma of 2.5
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 186 {  0, 107,   3, 187,   0, 125,   6, 212, },
 187 { 39,   7,  86,  28,  49,  11, 102,  36, },
 188 {  1, 158,   0, 131,   3, 180,   1, 151, },
 189 { 68,  19,  52,  12,  81,  25,  64,  17, },
 190 {  0, 119,   5, 203,   0, 113,   4, 195, },
 191 { 45,   9,  96,  33,  42,   8,  91,  30, },
 192 {  2, 172,   1, 144,   2, 165,   0, 137, },
 193 { 77,  23,  60,  15,  72,  21,  56,  14, },
 194 };
 195 #endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 203                       uint16_t *aDest, int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     int shift = 11 + 16 - output_bits;
 209
 210 #define output_pixel(pos, val) \
 211     if (big_endian) { \
 212         if (output_bits == 16) { \
 213             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 214         } else { \
 215             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 216         } \
 217     } else { \
 218         if (output_bits == 16) { \
 219             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 220         } else { \
 221             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 222         } \
 223     }
 224     for (i = 0; i < dstW; i++) {
 225         int val = 1 << (26-output_bits);
 226         int j;
 227
 228         for (j = 0; j < lumFilterSize; j++)
 229             val += lumSrc[j][i] * lumFilter[j];
 230
 231         output_pixel(&dest[i], val);
 232     }
 233
 234     if (uDest) {
 235         for (i = 0; i < chrDstW; i++) {
 236             int u = 1 << (26-output_bits);
 237             int v = 1 << (26-output_bits);
 238             int j;
 239
 240             for (j = 0; j < chrFilterSize; j++) {
 241                 u += chrUSrc[j][i] * chrFilter[j];
 242                 v += chrVSrc[j][i] * chrFilter[j];
 243             }
 244
 245             output_pixel(&uDest[i], u);
 246             output_pixel(&vDest[i], v);
 247         }
 248     }
 249
 250     if (CONFIG_SWSCALE_ALPHA && aDest) {
 251         for (i = 0; i < dstW; i++) {
 252             int val = 1 << (26-output_bits);
 253             int j;
 254
 255             for (j = 0; j < lumFilterSize; j++)
 256                 val += alpSrc[j][i] * lumFilter[j];
 257
 258             output_pixel(&aDest[i], val);
 259         }
 260     }
 261 #undef output_pixel
 262 }
 263
 264 #define yuv2NBPS(bits, BE_LE, is_be) \
 265 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 266                               const int16_t **lumSrc, int lumFilterSize, \
 267                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 268                               const int16_t **chrVSrc, \
 269                               int chrFilterSize, const int16_t **alpSrc, \
 270                               uint8_t *_dest, uint8_t *_uDest, uint8_t *_vDest, \
 271                               uint8_t *_aDest, int dstW, int chrDstW) \
 272 { \
 273     uint16_t *dest  = (uint16_t *) _dest,  *uDest = (uint16_t *) _uDest, \
 274              *vDest = (uint16_t *) _vDest, *aDest = (uint16_t *) _aDest; \
 275     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 276                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 277                           alpSrc, \
 278                           dest, uDest, vDest, aDest, \
 279                           dstW, chrDstW, is_be, bits); \
 280 }
 281 yuv2NBPS( 9, BE, 1);
 282 yuv2NBPS( 9, LE, 0);
 283 yuv2NBPS(10, BE, 1);
 284 yuv2NBPS(10, LE, 0);
 285 yuv2NBPS(16, BE, 1);
 286 yuv2NBPS(16, LE, 0);
 287
 288 static inline void yuv2yuvX16_c(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 289                                 const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize,
 290                                 const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, int dstW, int chrDstW,
 291                                 enum PixelFormat dstFormat)
 292 {
 293 #define conv16(bits) \
 294     if (isBE(dstFormat)) { \
 295         yuv2yuvX ## bits ## BE_c(c, lumFilter, lumSrc, lumFilterSize, \
 296                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 297                                  alpSrc, \
 298                                  dest, uDest, vDest, aDest, \
 299                                  dstW, chrDstW); \
 300     } else { \
 301         yuv2yuvX ## bits ## LE_c(c, lumFilter, lumSrc, lumFilterSize, \
 302                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 303                                  alpSrc, \
 304                                  dest, uDest, vDest, aDest, \
 305                                  dstW, chrDstW); \
 306     }
 307     if (is16BPS(dstFormat)) {
 308         conv16(16);
 309     } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
 310         conv16(9);
 311     } else {
 312         conv16(10);
 313     }
 314 #undef conv16
 315 }
 316
 317 static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 318                               const int16_t **lumSrc, int lumFilterSize,
 319                               const int16_t *chrFilter, const int16_t **chrUSrc,
 320                               const int16_t **chrVSrc,
 321                               int chrFilterSize, const int16_t **alpSrc,
 322                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 323                               uint8_t *aDest, int dstW, int chrDstW)
 324 {
 325     //FIXME Optimize (just quickly written not optimized..)
 326     int i;
 327     for (i=0; i<dstW; i++) {
 328         int val=1<<18;
 329         int j;
 330         for (j=0; j<lumFilterSize; j++)
 331             val += lumSrc[j][i] * lumFilter[j];
 332
 333         dest[i]= av_clip_uint8(val>>19);
 334     }
 335
 336     if (uDest)
 337         for (i=0; i<chrDstW; i++) {
 338             int u=1<<18;
 339             int v=1<<18;
 340             int j;
 341             for (j=0; j<chrFilterSize; j++) {
 342                 u += chrUSrc[j][i] * chrFilter[j];
 343                 v += chrVSrc[j][i] * chrFilter[j];
 344             }
 345
 346             uDest[i]= av_clip_uint8(u>>19);
 347             vDest[i]= av_clip_uint8(v>>19);
 348         }
 349
 350     if (CONFIG_SWSCALE_ALPHA && aDest)
 351         for (i=0; i<dstW; i++) {
 352             int val=1<<18;
 353             int j;
 354             for (j=0; j<lumFilterSize; j++)
 355                 val += alpSrc[j][i] * lumFilter[j];
 356
 357             aDest[i]= av_clip_uint8(val>>19);
 358         }
 359
 360 }
 361
 362 static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 363                                const int16_t **lumSrc, int lumFilterSize,
 364                                const int16_t *chrFilter, const int16_t **chrUSrc,
 365                                const int16_t **chrVSrc, int chrFilterSize,
 366                                const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest,
 367                                uint8_t *vDest, uint8_t *aDest,
 368                                int dstW, int chrDstW)
 369 {
 370     enum PixelFormat dstFormat = c->dstFormat;
 371
 372     //FIXME Optimize (just quickly written not optimized..)
 373     int i;
 374     for (i=0; i<dstW; i++) {
 375         int val=1<<18;
 376         int j;
 377         for (j=0; j<lumFilterSize; j++)
 378             val += lumSrc[j][i] * lumFilter[j];
 379
 380         dest[i]= av_clip_uint8(val>>19);
 381     }
 382
 383     if (!uDest)
 384         return;
 385
 386     if (dstFormat == PIX_FMT_NV12)
 387         for (i=0; i<chrDstW; i++) {
 388             int u=1<<18;
 389             int v=1<<18;
 390             int j;
 391             for (j=0; j<chrFilterSize; j++) {
 392                 u += chrUSrc[j][i] * chrFilter[j];
 393                 v += chrVSrc[j][i] * chrFilter[j];
 394             }
 395
 396             uDest[2*i]= av_clip_uint8(u>>19);
 397             uDest[2*i+1]= av_clip_uint8(v>>19);
 398         }
 399     else
 400         for (i=0; i<chrDstW; i++) {
 401             int u=1<<18;
 402             int v=1<<18;
 403             int j;
 404             for (j=0; j<chrFilterSize; j++) {
 405                 u += chrUSrc[j][i] * chrFilter[j];
 406                 v += chrVSrc[j][i] * chrFilter[j];
 407             }
 408
 409             uDest[2*i]= av_clip_uint8(v>>19);
 410             uDest[2*i+1]= av_clip_uint8(u>>19);
 411         }
 412 }
 413
 /*
  * Loop prologue for packed output fed by the multi-tap vertical filter.
  * Opens `for (i ...) {` over dstW>>1 pixel pairs and leaves it open; the code
  * following the expansion supplies the per-pair stores and the closing brace.
  * Per iteration it declares and computes:
  *   Y1, Y2  - two luma samples (columns 2*i and 2*i+1),
  *   U, V    - one chroma pair (column i),
  *   A1, A2  - two alpha samples, only when `alpha` is non-zero,
  * each accumulated from 1<<18 and shifted right by 19, with no clipping.
  * It also declares i2 (= 2*i), j, and the uninitialised pointers r, g, b of
  * the given `type`.
  * Expects i, dstW, lumSrc, lumFilter, lumFilterSize, chrUSrc, chrVSrc,
  * chrFilter, chrFilterSize (and alpSrc when alpha is used) to be in scope.
  */
 414 #define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha) \
 415     for (i=0; i<(dstW>>1); i++) {\
 416         int j;\
 417         int Y1 = 1<<18;\
 418         int Y2 = 1<<18;\
 419         int U  = 1<<18;\
 420         int V  = 1<<18;\
 421         int av_unused A1, A2;\
 422         type av_unused *r, *b, *g;\
 423         const int i2= 2*i;\
 424         \
 425         for (j=0; j<lumFilterSize; j++) {\
 426             Y1 += lumSrc[j][i2] * lumFilter[j];\
 427             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 428         }\
 429         for (j=0; j<chrFilterSize; j++) {\
 430             U += chrUSrc[j][i] * chrFilter[j];\
 431             V += chrVSrc[j][i] * chrFilter[j];\
 432         }\
 433         Y1>>=19;\
 434         Y2>>=19;\
 435         U >>=19;\
 436         V >>=19;\
 437         if (alpha) {\
 438             A1 = 1<<18;\
 439             A2 = 1<<18;\
 440             for (j=0; j<lumFilterSize; j++) {\
 441                 A1 += alpSrc[j][i2  ] * lumFilter[j];\
 442                 A2 += alpSrc[j][i2+1] * lumFilter[j];\
 443             }\
 444             A1>>=19;\
 445             A2>>=19;\
 446         }
 447
 /*
  * YSCALE_YUV_2_PACKEDX_NOCLIP_C followed by clamping.
  * When bit 8 is set in Y1|Y2|U|V, each of the four values is clamped to
  * 0..255; when `alpha` is non-zero and bit 8 is set in A1|A2, both alpha
  * values go through av_clip_uint8().  The bit test is a cheap pre-check so
  * that the comparisons are skipped for in-range values.
  * The loop opened by the NOCLIP macro is still open after this expansion.
  */
 448 #define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
 449         YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha)\
 450         if ((Y1|Y2|U|V)&256) {\
 451             if (Y1>255)   Y1=255; \
 452             else if (Y1<0)Y1=0;   \
 453             if (Y2>255)   Y2=255; \
 454             else if (Y2<0)Y2=0;   \
 455             if (U>255)    U=255;  \
 456             else if (U<0) U=0;    \
 457             if (V>255)    V=255;  \
 458             else if (V<0) V=0;    \
 459         }\
 460         if (alpha && ((A1|A2)&256)) {\
 461             A1=av_clip_uint8(A1);\
 462             A2=av_clip_uint8(A2);\
 463         }
 464
 465 #define YSCALE_YUV_2_PACKEDX_FULL_C(rnd,alpha) \
 466     for (i=0; i<dstW; i++) {\
 467         int j;\
 468         int Y = 0;\
 469         int U = -128<<19;\
 470         int V = -128<<19;\
 471         int av_unused A;\
 472         int R,G,B;\
 473         \
 474         for (j=0; j<lumFilterSize; j++) {\
 475             Y += lumSrc[j][i     ] * lumFilter[j];\
 476         }\
 477         for (j=0; j<chrFilterSize; j++) {\
 478             U += chrUSrc[j][i] * chrFilter[j];\
 479             V += chrVSrc[j][i] * chrFilter[j];\
 480         }\
 481         Y >>=10;\
 482         U >>=10;\
 483         V >>=10;\
 484         if (alpha) {\
 485             A = rnd;\
 486             for (j=0; j<lumFilterSize; j++)\
 487                 A += alpSrc[j][i     ] * lumFilter[j];\
 488             A >>=19;\
 489             if (A&256)\
 490                 A = av_clip_uint8(A);\
 491         }
 492
 /*
  * YSCALE_YUV_2_PACKEDX_FULL_C (with rnd>>3 as the alpha rounding term)
  * followed by the YUV -> RGB matrix step.
  * Y has c->yuv2rgb_y_offset subtracted, is multiplied by c->yuv2rgb_y_coeff
  * and gets `rnd` added; R, G and B are then formed from Y plus the U/V terms
  * weighted by the c->yuv2rgb_* coefficients.  If either of the two top bits
  * is set in R|G|B, each component is clamped to 0 .. (256<<22)-1.
  * The pixel loop opened by the inner macro is still open after this
  * expansion; `c` must be in scope in addition to the inner macro's needs.
  */
 493 #define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
 494     YSCALE_YUV_2_PACKEDX_FULL_C(rnd>>3,alpha)\
 495         Y-= c->yuv2rgb_y_offset;\
 496         Y*= c->yuv2rgb_y_coeff;\
 497         Y+= rnd;\
 498         R= Y + V*c->yuv2rgb_v2r_coeff;\
 499         G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
 500         B= Y +                          U*c->yuv2rgb_u2b_coeff;\
 501         if ((R|G|B)&(0xC0000000)) {\
 502             if (R>=(256<<22))   R=(256<<22)-1; \
 503             else if (R<0)R=0;   \
 504             if (G>=(256<<22))   G=(256<<22)-1; \
 505             else if (G<0)G=0;   \
 506             if (B>=(256<<22))   B=(256<<22)-1; \
 507             else if (B<0)B=0;   \
 508         }
 509
 /*
  * Loop prologue for 16-bit gray output fed by the multi-tap vertical filter.
  * Opens `for (i ...) {` over dstW>>1 pixel pairs and leaves it open.
  * Y1 and Y2 are accumulated from 1<<18, shifted right by 11 and, when bit 16
  * is set in the OR-ed test value, clamped to 0..65535.
  * U and V are declared as 1<<18 and never modified here; having only bit 18
  * set, they do not influence the `&65536` test.
  * Expects i, dstW, lumSrc, lumFilter and lumFilterSize to be in scope.
  */
 510 #define YSCALE_YUV_2_GRAY16_C \
 511     for (i=0; i<(dstW>>1); i++) {\
 512         int j;\
 513         int Y1 = 1<<18;\
 514         int Y2 = 1<<18;\
 515         int U  = 1<<18;\
 516         int V  = 1<<18;\
 517         \
 518         const int i2= 2*i;\
 519         \
 520         for (j=0; j<lumFilterSize; j++) {\
 521             Y1 += lumSrc[j][i2] * lumFilter[j];\
 522             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 523         }\
 524         Y1>>=11;\
 525         Y2>>=11;\
 526         if ((Y1|Y2|U|V)&65536) {\
 527             if (Y1>65535)   Y1=65535; \
 528             else if (Y1<0)Y1=0;   \
 529             if (Y2>65535)   Y2=65535; \
 530             else if (Y2<0)Y2=0;   \
 531         }
 532
 /*
  * YSCALE_YUV_2_PACKEDX_C followed by the lookup-table selection for the
  * current chroma pair: r, g and b are pointed at c->table_rV[V],
  * c->table_gU[U] + c->table_gV[V] and c->table_bU[U], cast to `type *`.
  * The pixel loop opened by the inner macro is still open afterwards.
  */
 533 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
 534     YSCALE_YUV_2_PACKEDX_C(type,alpha)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
 535     r = (type *)c->table_rV[V];   \
 536     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
 537     b = (type *)c->table_bU[U];
 538
 /*
  * Loop prologue for packed output blended from two source lines.
  * Opens `for (i ...) {` over dstW>>1 pixel pairs and leaves it open.
  * Y1/Y2 are (buf0*yalpha1 + buf1*yalpha)>>19 for columns 2*i and 2*i+1,
  * U/V are (xbuf0*uvalpha1 + xbuf1*uvalpha)>>19 for column i, and A1/A2 are
  * blended like luma from abuf0/abuf1 when `alpha` is non-zero.
  * Also declares i2 and the uninitialised pointers r, g, b of `type`.
  * buf0, buf1, ubuf0, ubuf1, vbuf0, vbuf1, abuf0, abuf1, yalpha, yalpha1,
  * uvalpha and uvalpha1 must be in scope at the expansion site.
  */
 539 #define YSCALE_YUV_2_PACKED2_C(type,alpha)   \
 540     for (i=0; i<(dstW>>1); i++) { \
 541         const int i2= 2*i;       \
 542         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
 543         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
 544         int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19;              \
 545         int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19;              \
 546         type av_unused *r, *b, *g;                                    \
 547         int av_unused A1, A2;                                         \
 548         if (alpha) {\
 549             A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
 550             A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
 551         }
 552
 /*
  * Loop prologue for 16-bit gray output blended from two source lines.
  * Opens `for (i ...) {` over dstW>>1 pixel pairs and leaves it open.
  * Y1/Y2 are (buf0*yalpha1 + buf1*yalpha)>>11 for columns 2*i and 2*i+1;
  * no clamping is applied here.
  */
 553 #define YSCALE_YUV_2_GRAY16_2_C   \
 554     for (i=0; i<(dstW>>1); i++) { \
 555         const int i2= 2*i;       \
 556         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
 557         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;
 558
 /*
  * YSCALE_YUV_2_PACKED2_C followed by the lookup-table selection: r, g and b
  * are pointed at c->table_rV[V], c->table_gU[U] + c->table_gV[V] and
  * c->table_bU[U], cast to `type *`.  U and V are used as indices without
  * clamping.  The pixel loop is still open afterwards.
  */
 559 #define YSCALE_YUV_2_RGB2_C(type,alpha) \
 560     YSCALE_YUV_2_PACKED2_C(type,alpha)\
 561     r = (type *)c->table_rV[V];\
 562     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 563     b = (type *)c->table_bU[U];
 564
 /*
  * Loop prologue for packed output taken from a single source line.
  * Opens `for (i ...) {` over dstW>>1 pixel pairs and leaves it open.
  * Y1/Y2 are buf0 samples >>7, U/V are ubuf1/vbuf1 samples >>7, and A1/A2 are
  * abuf0 samples >>7 when `alpha` is non-zero.  Also declares i2 and the
  * uninitialised pointers r, g, b of `type`.
  * NOTE(review): chroma is read from ubuf1/vbuf1 while luma and alpha come
  * from buf0/abuf0 -- confirm this pairing is intended.
  */
 565 #define YSCALE_YUV_2_PACKED1_C(type,alpha) \
 566     for (i=0; i<(dstW>>1); i++) {\
 567         const int i2= 2*i;\
 568         int Y1= buf0[i2  ]>>7;\
 569         int Y2= buf0[i2+1]>>7;\
 570         int U= (ubuf1[i])>>7;\
 571         int V= (vbuf1[i])>>7;\
 572         type av_unused *r, *b, *g;\
 573         int av_unused A1, A2;\
 574         if (alpha) {\
 575             A1= abuf0[i2  ]>>7;\
 576             A2= abuf0[i2+1]>>7;\
 577         }
 578
 /*
  * Loop prologue for 16-bit gray output taken from a single source line.
  * Opens `for (i ...) {` over dstW>>1 pixel pairs and leaves it open.
  * Y1/Y2 are the buf0 samples at columns 2*i and 2*i+1 shifted left by 1.
  */
 579 #define YSCALE_YUV_2_GRAY16_1_C \
 580     for (i=0; i<(dstW>>1); i++) {\
 581         const int i2= 2*i;\
 582         int Y1= buf0[i2  ]<<1;\
 583         int Y2= buf0[i2+1]<<1;
 584
 /*
  * YSCALE_YUV_2_PACKED1_C followed by the lookup-table selection: r, g and b
  * are pointed at c->table_rV[V], c->table_gU[U] + c->table_gV[V] and
  * c->table_bU[U], cast to `type *`.  The pixel loop is still open afterwards.
  */
 585 #define YSCALE_YUV_2_RGB1_C(type,alpha) \
 586     YSCALE_YUV_2_PACKED1_C(type,alpha)\
 587     r = (type *)c->table_rV[V];\
 588     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 589     b = (type *)c->table_bU[U];
 590
 /*
  * Variant of YSCALE_YUV_2_PACKED1_C that averages chroma over two lines.
  * Opens `for (i ...) {` over dstW>>1 pixel pairs and leaves it open.
  * Y1/Y2 are buf0 samples >>7, U/V are (xbuf0 + xbuf1)>>8, and A1/A2 are
  * abuf0 samples >>7 when `alpha` is non-zero.  Also declares i2 and the
  * uninitialised pointers r, g, b of `type`.
  */
 591 #define YSCALE_YUV_2_PACKED1B_C(type,alpha) \
 592     for (i=0; i<(dstW>>1); i++) {\
 593         const int i2= 2*i;\
 594         int Y1= buf0[i2  ]>>7;\
 595         int Y2= buf0[i2+1]>>7;\
 596         int U= (ubuf0[i] + ubuf1[i])>>8;\
 597         int V= (vbuf0[i] + vbuf1[i])>>8;\
 598         type av_unused *r, *b, *g;\
 599         int av_unused A1, A2;\
 600         if (alpha) {\
 601             A1= abuf0[i2  ]>>7;\
 602             A2= abuf0[i2+1]>>7;\
 603         }
 604
 /*
  * YSCALE_YUV_2_PACKED1B_C followed by the lookup-table selection: r, g and b
  * are pointed at c->table_rV[V], c->table_gU[U] + c->table_gV[V] and
  * c->table_bU[U], cast to `type *`.  The pixel loop is still open afterwards.
  */
 605 #define YSCALE_YUV_2_RGB1B_C(type,alpha) \
 606     YSCALE_YUV_2_PACKED1B_C(type,alpha)\
 607     r = (type *)c->table_rV[V];\
 608     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 609     b = (type *)c->table_bU[U];
 610
 /*
  * Complete loop (opened and closed here) producing 1 bit-per-pixel output
  * from two blended source lines.
  * Each iteration handles 8 pixels: every pixel's blended luma
  * ((buf0*yalpha1 + buf1*yalpha)>>19) plus the dither offset for its column
  * indexes the table g, and the results are folded into acc by repeated
  * doubling, so the first pixel ends up in the highest position.  The byte is
  * stored as-is for PIX_FMT_MONOBLACK and bit-inverted otherwise, then dest
  * advances by one.
  * Only full groups of 8 are processed (i < dstW-7).
  * Declares d128 (row y&7 of dither_8x8_220) and g before the loop, so it
  * must be expanded where declarations are allowed; y, c, dest, buf0, buf1,
  * yalpha and yalpha1 must be in scope.
  * NOTE(review): assumes the entries of g are 0 or 1 -- confirm where the
  * table is built.
  */
 611 #define YSCALE_YUV_2_MONO2_C \
 612     const uint8_t * const d128=dither_8x8_220[y&7];\
 613     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 614     for (i=0; i<dstW-7; i+=8) {\
 615         int acc;\
 616         acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
 617         acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
 618         acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
 619         acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
 620         acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
 621         acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
 622         acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
 623         acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
 624         ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 625         dest++;\
 626     }
 627
 628 #define YSCALE_YUV_2_MONOX_C \
 629     const uint8_t * const d128=dither_8x8_220[y&7];\
 630     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 631     int acc=0;\
 632     for (i=0; i<dstW-1; i+=2) {\
 633         int j;\
 634         int Y1=1<<18;\
 635         int Y2=1<<18;\
 636 \
 637         for (j=0; j<lumFilterSize; j++) {\
 638             Y1 += lumSrc[j][i] * lumFilter[j];\
 639             Y2 += lumSrc[j][i+1] * lumFilter[j];\
 640         }\
 641         Y1>>=19;\
 642         Y2>>=19;\
 643         if ((Y1|Y2)&256) {\
 644             if (Y1>255)   Y1=255;\
 645             else if (Y1<0)Y1=0;\
 646             if (Y2>255)   Y2=255;\
 647             else if (Y2<0)Y2=0;\
 648         }\
 649         acc+= acc + g[Y1+d128[(i+0)&7]];\
 650         acc+= acc + g[Y2+d128[(i+1)&7]];\
 651         if ((i&7)==6) {\
 652             ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 653             dest++;\
 654         }\
 655     }
 656
 657 #define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
 658     switch(c->dstFormat) {\
 659     case PIX_FMT_RGB48BE:\
 660     case PIX_FMT_RGB48LE:\
 661         func(uint8_t,0)\
 662             ((uint8_t*)dest)[ 0]= r[Y1];\
 663             ((uint8_t*)dest)[ 1]= r[Y1];\
 664             ((uint8_t*)dest)[ 2]= g[Y1];\
 665             ((uint8_t*)dest)[ 3]= g[Y1];\
 666             ((uint8_t*)dest)[ 4]= b[Y1];\
 667             ((uint8_t*)dest)[ 5]= b[Y1];\
 668             ((uint8_t*)dest)[ 6]= r[Y2];\
 669             ((uint8_t*)dest)[ 7]= r[Y2];\
 670             ((uint8_t*)dest)[ 8]= g[Y2];\
 671             ((uint8_t*)dest)[ 9]= g[Y2];\
 672             ((uint8_t*)dest)[10]= b[Y2];\
 673             ((uint8_t*)dest)[11]= b[Y2];\
 674             dest+=12;\
 675         }\
 676         break;\
 677     case PIX_FMT_BGR48BE:\
 678     case PIX_FMT_BGR48LE:\
 679         func(uint8_t,0)\
 680             ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
 681             ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
 682             ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
 683             ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
 684             ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
 685             ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
 686             dest+=12;\
 687         }\
 688         break;\
 689     case PIX_FMT_RGBA:\
 690     case PIX_FMT_BGRA:\
 691         if (CONFIG_SMALL) {\
 692             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 693             func(uint32_t,needAlpha)\
 694                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
 695                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
 696             }\
 697         } else {\
 698             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 699                 func(uint32_t,1)\
 700                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
 701                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
 702                 }\
 703             } else {\
 704                 func(uint32_t,0)\
 705                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 706                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 707                 }\
 708             }\
 709         }\
 710         break;\
 711     case PIX_FMT_ARGB:\
 712     case PIX_FMT_ABGR:\
 713         if (CONFIG_SMALL) {\
 714             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 715             func(uint32_t,needAlpha)\
 716                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
 717                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
 718             }\
 719         } else {\
 720             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 721                 func(uint32_t,1)\
 722                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
 723                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
 724                 }\
 725             } else {\
 726                 func(uint32_t,0)\
 727                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 728                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 729                 }\
 730             }\
 731         }                \
 732         break;\
 733     case PIX_FMT_RGB24:\
 734         func(uint8_t,0)\
 735             ((uint8_t*)dest)[0]= r[Y1];\
 736             ((uint8_t*)dest)[1]= g[Y1];\
 737             ((uint8_t*)dest)[2]= b[Y1];\
 738             ((uint8_t*)dest)[3]= r[Y2];\
 739             ((uint8_t*)dest)[4]= g[Y2];\
 740             ((uint8_t*)dest)[5]= b[Y2];\
 741             dest+=6;\
 742         }\
 743         break;\
 744     case PIX_FMT_BGR24:\
 745         func(uint8_t,0)\
 746             ((uint8_t*)dest)[0]= b[Y1];\
 747             ((uint8_t*)dest)[1]= g[Y1];\
 748             ((uint8_t*)dest)[2]= r[Y1];\
 749             ((uint8_t*)dest)[3]= b[Y2];\
 750             ((uint8_t*)dest)[4]= g[Y2];\
 751             ((uint8_t*)dest)[5]= r[Y2];\
 752             dest+=6;\
 753         }\
 754         break;\
 755     case PIX_FMT_RGB565BE:\
 756     case PIX_FMT_RGB565LE:\
 757     case PIX_FMT_BGR565BE:\
 758     case PIX_FMT_BGR565LE:\
 759         {\
 760             const int dr1= dither_2x2_8[y&1    ][0];\
 761             const int dg1= dither_2x2_4[y&1    ][0];\
 762             const int db1= dither_2x2_8[(y&1)^1][0];\
 763             const int dr2= dither_2x2_8[y&1    ][1];\
 764             const int dg2= dither_2x2_4[y&1    ][1];\
 765             const int db2= dither_2x2_8[(y&1)^1][1];\
 766             func(uint16_t,0)\
 767                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 768                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 769             }\
 770         }\
 771         break;\
 772     case PIX_FMT_RGB555BE:\
 773     case PIX_FMT_RGB555LE:\
 774     case PIX_FMT_BGR555BE:\
 775     case PIX_FMT_BGR555LE:\
 776         {\
 777             const int dr1= dither_2x2_8[y&1    ][0];\
 778             const int dg1= dither_2x2_8[y&1    ][1];\
 779             const int db1= dither_2x2_8[(y&1)^1][0];\
 780             const int dr2= dither_2x2_8[y&1    ][1];\
 781             const int dg2= dither_2x2_8[y&1    ][0];\
 782             const int db2= dither_2x2_8[(y&1)^1][1];\
 783             func(uint16_t,0)\
 784                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 785                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 786             }\
 787         }\
 788         break;\
 789     case PIX_FMT_RGB444BE:\
 790     case PIX_FMT_RGB444LE:\
 791     case PIX_FMT_BGR444BE:\
 792     case PIX_FMT_BGR444LE:\
 793         {\
 794             const int dr1= dither_4x4_16[y&3    ][0];\
 795             const int dg1= dither_4x4_16[y&3    ][1];\
 796             const int db1= dither_4x4_16[(y&3)^3][0];\
 797             const int dr2= dither_4x4_16[y&3    ][1];\
 798             const int dg2= dither_4x4_16[y&3    ][0];\
 799             const int db2= dither_4x4_16[(y&3)^3][1];\
 800             func(uint16_t,0)\
 801                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 802                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 803             }\
 804         }\
 805         break;\
 806     case PIX_FMT_RGB8:\
 807     case PIX_FMT_BGR8:\
 808         {\
 809             const uint8_t * const d64= dither_8x8_73[y&7];\
 810             const uint8_t * const d32= dither_8x8_32[y&7];\
 811             func(uint8_t,0)\
 812                 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
 813                 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
 814             }\
 815         }\
 816         break;\
 817     case PIX_FMT_RGB4:\
 818     case PIX_FMT_BGR4:\
 819         {\
 820             const uint8_t * const d64= dither_8x8_73 [y&7];\
 821             const uint8_t * const d128=dither_8x8_220[y&7];\
 822             func(uint8_t,0)\
 823                 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
 824                                  + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
 825             }\
 826         }\
 827         break;\
 828     case PIX_FMT_RGB4_BYTE:\
 829     case PIX_FMT_BGR4_BYTE:\
 830         {\
 831             const uint8_t * const d64= dither_8x8_73 [y&7];\
 832             const uint8_t * const d128=dither_8x8_220[y&7];\
 833             func(uint8_t,0)\
 834                 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
 835                 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
 836             }\
 837         }\
 838         break;\
 839     case PIX_FMT_MONOBLACK:\
 840     case PIX_FMT_MONOWHITE:\
 841         {\
 842             func_monoblack\
 843         }\
 844         break;\
 845     case PIX_FMT_YUYV422:\
 846         func2\
 847             ((uint8_t*)dest)[2*i2+0]= Y1;\
 848             ((uint8_t*)dest)[2*i2+1]= U;\
 849             ((uint8_t*)dest)[2*i2+2]= Y2;\
 850             ((uint8_t*)dest)[2*i2+3]= V;\
 851         }                \
 852         break;\
 853     case PIX_FMT_UYVY422:\
 854         func2\
 855             ((uint8_t*)dest)[2*i2+0]= U;\
 856             ((uint8_t*)dest)[2*i2+1]= Y1;\
 857             ((uint8_t*)dest)[2*i2+2]= V;\
 858             ((uint8_t*)dest)[2*i2+3]= Y2;\
 859         }                \
 860         break;\
 861     case PIX_FMT_GRAY16BE:\
 862         func_g16\
 863             ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
 864             ((uint8_t*)dest)[2*i2+1]= Y1;\
 865             ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
 866             ((uint8_t*)dest)[2*i2+3]= Y2;\
 867         }                \
 868         break;\
 869     case PIX_FMT_GRAY16LE:\
 870         func_g16\
 871             ((uint8_t*)dest)[2*i2+0]= Y1;\
 872             ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
 873             ((uint8_t*)dest)[2*i2+2]= Y2;\
 874             ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
 875         }                \
 876         break;\
 877     }
 878
/**
 * Packed-output writer for the multi-tap vertical filter case.
 *
 * The entire body is produced by YSCALE_YUV_2_ANYRGB_C, which receives the
 * "X" flavours of the RGB, packed-YUV, GRAY16 and MONO loop macros. The
 * visible part of that macro refers to dest, y and i by name, so the
 * parameter names and the local index must not be renamed; presumably the
 * remaining parameters are consumed the same way by the loop macros.
 */
static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
                          const int16_t **lumSrc, int lumFilterSize,
                          const int16_t *chrFilter, const int16_t **chrUSrc,
                          const int16_t **chrVSrc, int chrFilterSize,
                          const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
{
    int i; // loop index referenced from inside the macro expansion
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
}
 888
/**
 * Packed RGB writer used for the 24/32-bit RGB and BGR destination formats
 * handled by the switch below; any other c->dstFormat hits assert(0).
 *
 * The per-pixel loop is opened by YSCALE_YUV_2_RGBX_FULL_C and closed by the
 * brace that follows each store group. R, G, B and A are not declared in this
 * function, so they presumably come from that macro.
 *
 * The case labels intentionally fall through to share one store sequence:
 *  - 32-bit, alpha last (RGBA/BGRA): aidx stays 3.
 *  - 24-bit (RGB24/BGR24): aidx becomes 2; the alpha store to dest[2] is
 *    then overwritten by the colour store to dest[2] that follows it.
 *  - 32-bit, alpha first (ARGB/ABGR): dest is advanced by one and aidx ends
 *    up as -1, so alpha lands in the byte just before the colour triple.
 */
static inline void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc, int chrFilterSize,
                                   const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
{
    int i;
    int step= c->dstFormatBpp/8; // bytes per destination pixel
    int aidx= 3;                 // alpha byte offset relative to dest

    switch(c->dstFormat) {
    case PIX_FMT_ARGB:
        dest++;
        aidx= 0;
        /* fall through: the decrement below turns aidx into -1 */
    case PIX_FMT_RGB24:
        aidx--;
        /* fall through */
    case PIX_FMT_RGBA:
        if (CONFIG_SMALL) {
            // single loop body, alpha decided per pixel at run time
            int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
            YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
                dest[aidx]= needAlpha ? A : 255;
                dest[0]= R>>22;
                dest[1]= G>>22;
                dest[2]= B>>22;
                dest+= step;
            }
        } else {
            // two specialised loops, with the alpha flag passed as a constant
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
                    dest[aidx]= A;
                    dest[0]= R>>22;
                    dest[1]= G>>22;
                    dest[2]= B>>22;
                    dest+= step;
                }
            } else {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
                    dest[aidx]= 255;
                    dest[0]= R>>22;
                    dest[1]= G>>22;
                    dest[2]= B>>22;
                    dest+= step;
                }
            }
        }
        break;
    case PIX_FMT_ABGR:
        dest++;
        aidx= 0;
        /* fall through: the decrement below turns aidx into -1 */
    case PIX_FMT_BGR24:
        aidx--;
        /* fall through */
    case PIX_FMT_BGRA:
        // same structure as the RGB group above with R and B stores swapped
        if (CONFIG_SMALL) {
            int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
            YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
                dest[aidx]= needAlpha ? A : 255;
                dest[0]= B>>22;
                dest[1]= G>>22;
                dest[2]= R>>22;
                dest+= step;
            }
        } else {
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
                    dest[aidx]= A;
                    dest[0]= B>>22;
                    dest[1]= G>>22;
                    dest[2]= R>>22;
                    dest+= step;
                }
            } else {
                YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
                    dest[aidx]= 255;
                    dest[0]= B>>22;
                    dest[1]= G>>22;
                    dest[2]= R>>22;
                    dest+= step;
                }
            }
        }
        break;
    default:
        assert(0);
    }
}
 974
 975 static void fillPlane(uint8_t* plane, int stride, int width, int height, int y, uint8_t val)
 976 {
 977     int i;
 978     uint8_t *ptr = plane + stride*y;
 979     for (i=0; i<height; i++) {
 980         memset(ptr, val, width);
 981         ptr += stride;
 982     }
 983 }
 984
/**
 * Generates the three 48-bit packed RGB input converters for one byte order
 * and one component order:
 *   <compA><compB><compC>48<LE_BE>ToY_c, ...ToUV_c and ...ToUV_half_c.
 *
 * rfunc reads one 16-bit component from the byte stream; every component is
 * reduced to its upper 8 bits (>> 8) before the colour matrix is applied.
 * compA/compB/compC name the local variables that receive the components in
 * memory order. The conversion expressions refer to r, g and b directly, so
 * the three arguments must be a permutation of r, g, b.
 *
 * The _half variant adds two horizontally adjacent pixels per output sample
 * and compensates with one extra bit in the rounding term and final shift.
 */
#define rgb48funcs(LE_BE, rfunc, compA, compB, compC) \
static void compA ## compB ## compC ## 48 ## LE_BE ## ToY_c( \
                       uint8_t *dst, const uint8_t *src, int width, \
                       uint32_t *unused) \
{ \
    int i; \
    for (i = 0; i < width; i++) { \
        int compA = rfunc(&src[i*6+0]) >> 8; \
        int compB = rfunc(&src[i*6+2]) >> 8; \
        int compC = rfunc(&src[i*6+4]) >> 8; \
 \
        dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
    } \
} \
 \
static void compA ## compB ## compC ## 48 ## LE_BE ## ToUV_c( \
                        uint8_t *dstU, uint8_t *dstV, \
                        const uint8_t *src1, const uint8_t *src2, \
                        int width, uint32_t *unused) \
{ \
    int i; \
    assert(src1==src2); \
    for (i = 0; i < width; i++) { \
        int compA = rfunc(&src1[6*i + 0]) >> 8; \
        int compB = rfunc(&src1[6*i + 2]) >> 8; \
        int compC = rfunc(&src1[6*i + 4]) >> 8; \
 \
        dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
        dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; \
    } \
} \
 \
static void compA ## compB ## compC ## 48 ## LE_BE ## ToUV_half_c( \
                            uint8_t *dstU, uint8_t *dstV, \
                            const uint8_t *src1, const uint8_t *src2, \
                            int width, uint32_t *unused) \
{ \
    int i; \
    assert(src1==src2); \
    for (i = 0; i < width; i++) { \
        int compA = (rfunc(&src1[12*i + 0]) >> 8) + (rfunc(&src1[12*i + 6]) >> 8); \
        int compB = (rfunc(&src1[12*i + 2]) >> 8) + (rfunc(&src1[12*i + 8]) >> 8); \
        int compC = (rfunc(&src1[12*i + 4]) >> 8) + (rfunc(&src1[12*i + 10]) >> 8); \
 \
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1); \
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1); \
    } \
}
// rgb48LE / rgb48BE / bgr48LE / bgr48BE converter families
rgb48funcs(LE, AV_RL16, r, g, b);
rgb48funcs(BE, AV_RB16, r, g, b);
rgb48funcs(LE, AV_RL16, b, g, r);
rgb48funcs(BE, AV_RB16, b, g, r);
1037
/**
 * Generates name_c(), a packed-RGB to luma line converter.
 *
 * type            integer type of one packed source pixel
 * shr/shg/shb     right shift applied to the pixel before masking a component
 * maskr/g/b       mask applied after that shift
 * RY/GY/BY        per-component coefficients as passed by the instantiation;
 *                 the component values are used as masked, so the
 *                 instantiations below pass pre-shifted coefficients
 * S               final right shift; the rounding/offset term is 33<<(S-1)
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static void name ## _c(uint8_t *dst, const uint8_t *src, \
                       int width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++) {\
        int b= (((const type*)src)[i]>>shb)&maskb;\
        int g= (((const type*)src)[i]>>shg)&maskg;\
        int r= (((const type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}

// one luma converter per packed 32-, 16- and 15-bit RGB/BGR layout
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t,bgr321ToY,16,16, 0, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t,rgb321ToY, 0,16,16, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1060
/**
 * Extract the alpha channel from 4-byte pixels whose first byte is alpha.
 * Copies byte 0 of each of the width pixels in src to dst; unused is ignored.
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int px = 0;

    while (px < width) {
        const uint8_t *pixel = src + 4*px;
        dst[px] = pixel[0];
        px++;
    }
}
1068
/**
 * Extract the alpha channel from 4-byte pixels whose last byte is alpha.
 * Copies byte 3 of each of the width pixels in src to dst; unused is ignored.
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int px = 0;

    while (px < width) {
        const uint8_t *pixel = src + 4*px;
        dst[px] = pixel[3];
        px++;
    }
}
1076
/**
 * Generates name_c() and name_half_c(), packed-RGB to chroma line converters.
 *
 * type            integer type of one packed source pixel
 * shp             right shift applied to the whole pixel first
 * maskr/g/b       component masks applied after shp
 * shr/shg/shb     right shift applied to each masked component
 * RU..BV          coefficients for the U and V rows, as passed by the
 *                 instantiation (pre-shifted where the component is not moved
 *                 down to bit 0)
 * S               final right shift; the rounding/offset term is 257<<(S-1)
 *
 * name_half_c() produces one chroma sample per two source pixels: it adds the
 * two packed pixels, separating green (the bits outside maskr|maskb) from
 * red/blue so that each component sum can carry into one extra bit, which is
 * why the masks are widened with 2*mask and the final shift is S+1.
 */
#define BGR2UV(type, name, shr, shg, shb, shp, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S) \
static void name ## _c(uint8_t *dstU, uint8_t *dstV, \
                       const uint8_t *src, const uint8_t *dummy, \
                       int width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++) {\
        int b= ((((const type*)src)[i]>>shp)&maskb)>>shb;\
        int g= ((((const type*)src)[i]>>shp)&maskg)>>shg;\
        int r= ((((const type*)src)[i]>>shp)&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static void name ## _half_c(uint8_t *dstU, uint8_t *dstV, \
                            const uint8_t *src, const uint8_t *dummy, \
                            int width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++) {\
        int pix0= ((const type*)src)[2*i+0]>>shp;\
        int pix1= ((const type*)src)[2*i+1]>>shp;\
        int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
        g&= maskg|(2*maskg);\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}

// one chroma converter pair per packed 32-, 16- and 15-bit RGB/BGR layout
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t,bgr321ToUV,16, 0, 0, 8, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t,rgb321ToUV, 0, 0,16, 8,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1120
/**
 * Palette lookup for luma: each source byte indexes pal, and the low 8 bits
 * of the palette entry are stored in dst.
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint32_t entry = pal[src[x]];
        dst[x] = entry & 0xFF;
    }
}
1130
/**
 * Palette lookup for chroma: each byte of src1 indexes pal; bits 8..15 of the
 * entry go to dstU and bits 16..23 to dstV. src2 must equal src1.
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x;

    assert(src1 == src2);
    for (x = 0; x < width; x++) {
        const int entry = pal[src1[x]];
        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
    }
}
1144
/**
 * Expand 1 bit-per-pixel input, where a clear bit means white, to 8-bit luma.
 *
 * Bits are consumed most-significant first within each source byte; a clear
 * bit produces 255 and a set bit produces 0.
 *
 * Every one of the width pixels is written. The previous version iterated
 * over whole source bytes only (width/8), so the last width%8 pixels of a
 * line were left untouched when width was not a multiple of 8.
 *
 * @param dst    output, width bytes
 * @param src    input, at least (width+7)/8 bytes
 * @param width  number of pixels
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;
    for (i = 0; i < width; i++) {
        // invert in unsigned arithmetic so the shift below never sees a negative value
        const unsigned bits = src[i >> 3] ^ 0xFFu;
        dst[i] = ((bits >> (7 - (i & 7))) & 1) * 255;
    }
}
1155
/**
 * Expand 1 bit-per-pixel input, where a set bit means white, to 8-bit luma.
 *
 * Bits are consumed most-significant first within each source byte; a set
 * bit produces 255 and a clear bit produces 0.
 *
 * Every one of the width pixels is written. The previous version iterated
 * over whole source bytes only (width/8), so the last width%8 pixels of a
 * line were left untouched when width was not a multiple of 8.
 *
 * @param dst    output, width bytes
 * @param src    input, at least (width+7)/8 bytes
 * @param width  number of pixels
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;
    for (i = 0; i < width; i++) {
        const unsigned bits = src[i >> 3];
        dst[i] = ((bits >> (7 - (i & 7))) & 1) * 255;
    }
}
1166
1167 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
1168                        const int16_t *chrUSrc, const int16_t *chrVSrc,
1169                        const int16_t *alpSrc,
1170                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
1171                        uint8_t *aDest, int dstW, int chrDstW)
1172 {
1173     int i;
1174     for (i=0; i<dstW; i++) {
1175         int val= (lumSrc[i]+64)>>7;
1176         dest[i]= av_clip_uint8(val);
1177     }
1178
1179     if (uDest)
1180         for (i=0; i<chrDstW; i++) {
1181             int u=(chrUSrc[i]+64)>>7;
1182             int v=(chrVSrc[i]+64)>>7;
1183             uDest[i]= av_clip_uint8(u);
1184             vDest[i]= av_clip_uint8(v);
1185         }
1186
1187     if (CONFIG_SWSCALE_ALPHA && aDest)
1188         for (i=0; i<dstW; i++) {
1189             int val= (alpSrc[i]+64)>>7;
1190             aDest[i]= av_clip_uint8(val);
1191         }
1192 }
1193
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Two-line variant of the packed writer: the body is generated by
 * YSCALE_YUV_2_ANYRGB_C with the "2" flavours of the loop macros.
 * yalpha1 and uvalpha1 are the complements (4095 - x) of the two blend
 * weights; they, i and the parameters are presumably picked up by name inside
 * the macro expansion, so none of them may be renamed.
 */
static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
                          const uint16_t *buf1, const uint16_t *ubuf0,
                          const uint16_t *ubuf1, const uint16_t *vbuf0,
                          const uint16_t *vbuf1, const uint16_t *abuf0,
                          const uint16_t *abuf1, uint8_t *dest, int dstW,
                          int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
1210
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Single-line variant of the packed writer. The body is generated by
 * YSCALE_YUV_2_ANYRGB_C; uvalpha selects which set of loop macros is used:
 * below 2048 the "1" flavours, otherwise the "1B" flavours. The mono case
 * reuses the two-line macro (YSCALE_YUV_2_MONO2_C), which is why buf1,
 * yalpha and yalpha1 are defined locally as fixed stand-ins.
 */
static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
                          const uint16_t *ubuf0, const uint16_t *ubuf1,
                          const uint16_t *vbuf0, const uint16_t *vbuf1,
                          const uint16_t *abuf0, uint8_t *dest, int dstW,
                          int uvalpha, enum PixelFormat dstFormat,
                          int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
1233
1234 //FIXME yuy2* can read up to 7 samples too much
1235
/**
 * Copy every second byte of src, starting at byte 0, into dst
 * (the luma positions of a Y U Y V byte stream). unused is ignored.
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2*x];
        x++;
    }
}
1243
/**
 * Split the chroma of a Y U Y V byte stream: for each output sample, byte 1
 * of the 4-byte group goes to dstU and byte 3 to dstV. Only src1 is read;
 * src2 must equal src1 (checked after the loop).
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4*x;

        dstU[x] = group[1];
        dstV[x] = group[3];
    }
    assert(src1 == src2);
}
1254
/**
 * Take the second byte of every 2-byte sample: src1 feeds dstU and src2
 * feeds dstV. unused is ignored.
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int hi = 2*x + 1;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
        x++;
    }
}
1264
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Copy every second byte of src, starting at byte 1, into dst
 * (the luma positions of a U Y V Y byte stream). unused is ignored.
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2*x + 1];
        x++;
    }
}
1274
/**
 * Split the chroma of a U Y V Y byte stream: for each output sample, byte 0
 * of the 4-byte group goes to dstU and byte 2 to dstV. Only src1 is read;
 * src2 must equal src1 (checked after the loop).
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4*x;

        dstU[x] = group[0];
        dstV[x] = group[2];
    }
    assert(src1 == src2);
}
1285
/**
 * Take the first byte of every 2-byte sample: src1 feeds dstU and src2
 * feeds dstV. unused is ignored.
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int lo = 2*x;

        dstU[x] = src1[lo];
        dstV[x] = src2[lo];
        x++;
    }
}
1295
1296 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1297                                         const uint8_t *src, int width)
1298 {
1299     int i;
1300     for (i = 0; i < width; i++) {
1301         dst1[i] = src[2*i+0];
1302         dst2[i] = src[2*i+1];
1303     }
1304 }
1305
/**
 * Split interleaved chroma from src1 via nvXXtoUV_c(): even bytes go to dstU
 * and odd bytes to dstV. src2 and unused are ignored.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1312
/**
 * Split interleaved chroma from src1 via nvXXtoUV_c() with the destinations
 * swapped: even bytes go to dstV and odd bytes to dstU. src2 and unused are
 * ignored.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1319
// FIXME Maybe dither instead.
/**
 * Generates <endianness><depth>ToUV_c() and <endianness><depth>ToY_c(), which
 * reduce planar samples stored in 16-bit words to 8 bits.
 *
 * Each sample is read with rfunc and shifted right by (depth - 8); the
 * low-order bits are simply dropped (no rounding or dithering).
 * The UV variant reads the two planes from _srcU and _srcV independently.
 */
#define YUV_NBPS(depth, endianness, rfunc) \
static void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                                          const uint8_t *_srcU, const uint8_t *_srcV, \
                                          int width, uint32_t *unused) \
{ \
    int i; \
    const uint16_t *srcU = (const uint16_t*)_srcU; \
    const uint16_t *srcV = (const uint16_t*)_srcV; \
    for (i = 0; i < width; i++) { \
        dstU[i] = rfunc(&srcU[i])>>(depth-8); \
        dstV[i] = rfunc(&srcV[i])>>(depth-8); \
    } \
} \
\
static void endianness ## depth ## ToY_c(uint8_t *dstY, const uint8_t *_srcY, \
                                         int width, uint32_t *unused) \
{ \
    int i; \
    const uint16_t *srcY = (const uint16_t*)_srcY; \
    for (i = 0; i < width; i++) \
        dstY[i] = rfunc(&srcY[i])>>(depth-8); \
} \

// 9- and 10-bit readers in both byte orders
YUV_NBPS( 9, LE, AV_RL16)
YUV_NBPS( 9, BE, AV_RB16)
YUV_NBPS(10, LE, AV_RL16)
YUV_NBPS(10, BE, AV_RB16)
1348
1349 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1350                        int width, uint32_t *unused)
1351 {
1352     int i;
1353     for (i=0; i<width; i++) {
1354         int b= src[i*3+0];
1355         int g= src[i*3+1];
1356         int r= src[i*3+2];
1357
1358         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1359     }
1360 }
1361
1362 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1363                         const uint8_t *src2, int width, uint32_t *unused)
1364 {
1365     int i;
1366     for (i=0; i<width; i++) {
1367         int b= src1[3*i + 0];
1368         int g= src1[3*i + 1];
1369         int r= src1[3*i + 2];
1370
1371         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1372         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1373     }
1374     assert(src1 == src2);
1375 }
1376
1377 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1378                              const uint8_t *src2, int width, uint32_t *unused)
1379 {
1380     int i;
1381     for (i=0; i<width; i++) {
1382         int b= src1[6*i + 0] + src1[6*i + 3];
1383         int g= src1[6*i + 1] + src1[6*i + 4];
1384         int r= src1[6*i + 2] + src1[6*i + 5];
1385
1386         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1387         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1388     }
1389     assert(src1 == src2);
1390 }
1391
1392 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1393                        uint32_t *unused)
1394 {
1395     int i;
1396     for (i=0; i<width; i++) {
1397         int r= src[i*3+0];
1398         int g= src[i*3+1];
1399         int b= src[i*3+2];
1400
1401         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1402     }
1403 }
1404
1405 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1406                         const uint8_t *src2, int width, uint32_t *unused)
1407 {
1408     int i;
1409     assert(src1==src2);
1410     for (i=0; i<width; i++) {
1411         int r= src1[3*i + 0];
1412         int g= src1[3*i + 1];
1413         int b= src1[3*i + 2];
1414
1415         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1416         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1417     }
1418 }
1419
1420 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1421                              const uint8_t *src2, int width, uint32_t *unused)
1422 {
1423     int i;
1424     assert(src1==src2);
1425     for (i=0; i<width; i++) {
1426         int r= src1[6*i + 0] + src1[6*i + 3];
1427         int g= src1[6*i + 1] + src1[6*i + 4];
1428         int b= src1[6*i + 2] + src1[6*i + 5];
1429
1430         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1431         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1432     }
1433 }
1434
1435
/**
 * Generic horizontal FIR scaler (bilinear / bicubic and friends).
 *
 * For every output sample i, filterSize source bytes starting at
 * filterPos[i] are multiplied by the coefficients stored at
 * filter[filterSize*i ...] and summed. The sum is shifted right by 7 and
 * capped at 32767 because some filters overshoot. srcW and xInc are not
 * used by this implementation.
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     int srcW, int xInc,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    int out;

    for (out = 0; out < dstW; out++) {
        const int first = filterPos[out];
        const int coeffBase = filterSize*out;
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)src[first + tap]) * filter[coeffBase + tap];

        dst[out] = FFMIN(acc>>7, (1<<15)-1); // the cubic equation does overflow ...
    }
}
1455
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place chroma range expansion: every sample of both planes is capped at
 * 30775, multiplied by 4663, offset by -9289992 and shifted right by 12.
 */
static void chrRangeToJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int capped;

        capped  = FFMIN(dstU[x],30775);
        dstU[x] = (capped*4663 - 9289992)>>12; //-264
        capped  = FFMIN(dstV[x],30775);
        dstV[x] = (capped*4663 - 9289992)>>12; //-264
    }
}
/**
 * In-place chroma range compression: every sample of both planes is
 * multiplied by 1799, offset by 4081085 and shifted right by 11.
 */
static void chrRangeFromJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int scaled;

        scaled  = dstU[x]*1799 + 4081085;
        dstU[x] = scaled>>11; //1469
        scaled  = dstV[x]*1799 + 4081085;
        dstV[x] = scaled>>11; //1469
    }
}
/**
 * In-place luma range expansion: every sample is capped at 30189, multiplied
 * by 19077, offset by -39057361 and shifted right by 14.
 */
static void lumRangeToJpeg_c(uint16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int capped = FFMIN(dst[x],30189);

        dst[x] = (capped*19077 - 39057361)>>14;
    }
}
/**
 * In-place luma range compression: every sample is multiplied by 14071,
 * offset by 33561947 and shifted right by 14.
 */
static void lumRangeFromJpeg_c(uint16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int scaled = dst[x]*14071 + 33561947;

        dst[x] = scaled>>14;
    }
}
1486
1487 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1488                            const uint8_t *src, int srcW, int xInc)
1489 {
1490     int i;
1491     unsigned int xpos=0;
1492     for (i=0;i<dstWidth;i++) {
1493         register unsigned int xx=xpos>>16;
1494         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1495         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1496         xpos+=xInc;
1497     }
1498 }
1499
1500 // *** horizontal scale Y line to temp buffer
1501 static inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1502                            const uint8_t *src, int srcW, int xInc,
1503                            const int16_t *hLumFilter,
1504                            const int16_t *hLumFilterPos, int hLumFilterSize,
1505                            uint8_t *formatConvBuffer,
1506                            uint32_t *pal, int isAlpha)
1507 {
1508     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1509     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1510
1511     if (toYV12) {
1512         toYV12(formatConvBuffer, src, srcW, pal);
1513         src= formatConvBuffer;
1514     }
1515
1516     if (!c->hyscale_fast) {
1517         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1518     } else { // fast bilinear upscale / crap downscale
1519         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1520     }
1521
1522     if (convertRange)
1523         convertRange(dst, dstWidth);
1524 }
1525
1526 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1527                            int dstWidth, const uint8_t *src1,
1528                            const uint8_t *src2, int srcW, int xInc)
1529 {
1530     int i;
1531     unsigned int xpos=0;
1532     for (i=0;i<dstWidth;i++) {
1533         register unsigned int xx=xpos>>16;
1534         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1535         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1536         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1537         xpos+=xInc;
1538     }
1539 }
1540
1541 static inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1542                            const uint8_t *src1, const uint8_t *src2,
1543                            int srcW, int xInc, const int16_t *hChrFilter,
1544                            const int16_t *hChrFilterPos, int hChrFilterSize,
1545                            uint8_t *formatConvBuffer, uint32_t *pal)
1546 {
1547     if (c->chrToYV12) {
1548         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1549         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1550         src1= formatConvBuffer;
1551         src2= buf2;
1552     }
1553
1554     if (!c->hcscale_fast) {
1555         c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1556         c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1557     } else { // fast bilinear upscale / crap downscale
1558         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1559     }
1560
1561     if (c->chrConvertRange)
1562         c->chrConvertRange(dst1, dst2, dstWidth);
1563 }
1564
1565 static av_always_inline void
1566 find_c_packed_planar_out_funcs(SwsContext *c,
1567                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
1568                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
1569                                yuv2packedX_fn *yuv2packedX)
1570 {
1571     enum PixelFormat dstFormat = c->dstFormat;
1572
1573     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1574         *yuv2yuvX     = yuv2nv12X_c;
1575     } else if (is16BPS(dstFormat)) {
1576         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
1577     } else if (is9_OR_10BPS(dstFormat)) {
1578         if (dstFormat == PIX_FMT_YUV420P9BE || dstFormat == PIX_FMT_YUV420P9LE) {
1579             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
1580         } else {
1581             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
1582         }
1583     } else {
1584         *yuv2yuv1     = yuv2yuv1_c;
1585         *yuv2yuvX     = yuv2yuvX_c;
1586     }
1587     if(c->flags & SWS_FULL_CHR_H_INT) {
1588         *yuv2packedX = yuv2rgbX_c_full;
1589     } else {
1590         *yuv2packed1  = yuv2packed1_c;
1591         *yuv2packed2  = yuv2packed2_c;
1592         *yuv2packedX  = yuv2packedX_c;
1593     }
1594 }
1595
/* Compile-time switch for the buffer tracing macro below; 0 disables it. */
#define DEBUG_SWSCALE_BUFFERS 0
/* Forwards its arguments to av_log(c, AV_LOG_DEBUG, ...) when
 * DEBUG_SWSCALE_BUFFERS is non-zero. It refers to a variable named c, so it
 * can only be used where one is in scope.
 * NOTE(review): expands to an unbraced `if`, which would capture an `else`
 * that directly follows a use of the macro. */
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
1598
/**
 * Generic slice-based scaling loop.
 *
 * For every destination line that can be produced from the source lines
 * seen so far, the required source lines are horizontally scaled into the
 * line ring buffers (lumPixBuf / chrUPixBuf / chrVPixBuf / alpPixBuf) and
 * then handed to one of the vertical scaling / output functions
 * (yuv2yuv1, yuv2yuvX, yuv2packed1, yuv2packed2, yuv2packedX).
 * If the current slice does not contain all lines needed for the next
 * destination line, the available lines are buffered, the loop stops, and
 * the progress is stored in the context so a later call can continue.
 *
 * @param c          scaler context; dstY, lumBufIndex, chrBufIndex,
 *                   lastInLumBuf and lastInChrBuf are loaded at entry and
 *                   written back before returning
 * @param src        source plane pointers for this slice. NOTE: for packed
 *                   source formats all four entries are overwritten with
 *                   src[0]
 * @param srcStride  source strides. NOTE: modified in place (packed source:
 *                   all four set to srcStride[0]; entries 1 and 2 are
 *                   additionally left-shifted by c->vChrDrop)
 * @param srcSliceY  index of the first source line contained in the slice;
 *                   a value of 0 resets the buffering state
 * @param srcSliceH  number of source lines in the slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines written by this call
 */
static int swScale(SwsContext *c, const uint8_t* src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **chrVPixBuf= c->chrVPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* slice position/height expressed in chroma lines; the -((-x) >> s) form
       is intended as a round-up division by 2^s (relies on an arithmetic
       right shift of negative values) */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;
    /* local copies of the output functions: they may be replaced by the C
       versions for the last destination lines (see below) without touching
       the pointers stored in the context */
    yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed input: every "plane" aliases the single packed plane.
       NOTE: this writes into the caller-provided src/srcStride arrays. */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* scale the chroma strides by 2^vChrDrop (also modifies the caller's array) */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    /* warn (once per process, and only with SWS_PRINT_INFO) about destination
       strides that are not a multiple of 8 */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        /* function-level static: shared by all contexts and not synchronized */
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* first slice of a picture: reset the buffering state */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    /* remember where this call started so the number of produced lines can be returned */
    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        /* destination row pointers for this output line */
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        /* first luma input line of the last output line that shares this
           chroma row (low chrDstVSubSample bits of dstY forced to 1, clamped
           to dstH-1) */
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        /* if the first needed line lies beyond what has been buffered, skip
           the unneeded source lines in between */
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        /* the first needed line must still be inside the buffered window */
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* not enough input: buffer everything this slice still offers */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        /* luma (and alpha, when present): one source line per iteration */
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            /* the line to read must lie inside the current slice */
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                    hLumFilter, hLumFilterPos, hLumFilterSize,
                    formatConvBuffer,
                    pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
                        lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
                        formatConvBuffer,
                        pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        /* chroma: one source chroma line per iteration */
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            /* the bookkeeping below advances even when chroma scaling is skipped */
            if (c->needs_hcscale)
                hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
                          chrDstW, src1, src2, chrSrcW, chrXInc,
                          hChrFilter, hChrFilterPos, hChrFilterSize,
                          formatConvBuffer, pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if HAVE_MMX
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
#endif
        /* for the last two destination lines switch the local function
           pointers to the C implementations */
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
                                           &yuv2packed1, &yuv2packed2,
                                           &yuv2packedX);
        }

        {
            /* Pointers to the first buffered line needed by the vertical
               filter. The "+ v*BufSize" offset keeps the start index
               non-negative; the asserts on the buffer indices show the
               pointer arrays are addressed up to 2*v*BufSize entries
               (presumably each line pointer is stored twice so a filter
               window never has to wrap - confirm against the allocation code). */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                /* chroma is written only on destination lines that start a
                   chroma row, and never for gray output */
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrUBuf= chrUSrcPtr[0];
                    const int16_t *chrVBuf= chrVSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
                                uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
                                chrVSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                /* packed output: the whole filter window must lie inside the
                   2*v*BufSize pointer arrays */
                assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
                assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    /* second coefficient of the 2-tap vertical chroma filter */
                    int chrAlpha= vChrFilter[2*dstY+1];
                    yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
                                   *chrVSrcPtr, *(chrVSrcPtr+1),
                                   alpPixBuf ? *alpSrcPtr : NULL,
                                   dest, dstW, chrAlpha, dstFormat, flags, dstY);
                } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    /* second coefficients of the 2-tap vertical filters */
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    /* first coefficients multiplied by 0x10001, i.e.
                       v + (v << 16); presumably read by SIMD versions of
                       yuv2packed2 - confirm */
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
                                   *chrVSrcPtr, *(chrVSrcPtr+1),
                                   alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                   dest, dstW, lumAlpha, chrAlpha, dstY);
                } else { //general RGB
                    yuv2packedX(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* YUVA420P destination without an alpha line buffer: fill the alpha plane
       with 255; the arguments passed are the plane, its stride, dstW, the
       number of lines produced by this call, the first of those lines and
       the fill value */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if HAVE_MMX2
    /* issue a store fence when the CPU reports MMX2 support */
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
        __asm__ volatile("sfence":::"memory");
#endif
    emms_c();

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
1857
1858 static void sws_init_swScale_c(SwsContext *c)
1859 {
1860     enum PixelFormat srcFormat = c->srcFormat;
1861
1862     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
1863                                    &c->yuv2packed1, &c->yuv2packed2,
1864                                    &c->yuv2packedX);
1865
1866     c->hScale       = hScale_c;
1867
1868     if (c->flags & SWS_FAST_BILINEAR) {
1869         c->hyscale_fast = hyscale_fast_c;
1870         c->hcscale_fast = hcscale_fast_c;
1871     }
1872
1873     c->chrToYV12 = NULL;
1874     switch(srcFormat) {
1875         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
1876         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
1877         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
1878         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
1879         case PIX_FMT_RGB8     :
1880         case PIX_FMT_BGR8     :
1881         case PIX_FMT_PAL8     :
1882         case PIX_FMT_BGR4_BYTE:
1883         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
1884         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
1885         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
1886         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
1887         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
1888         case PIX_FMT_YUV420P16BE:
1889         case PIX_FMT_YUV422P16BE:
1890         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
1891         case PIX_FMT_YUV420P16LE:
1892         case PIX_FMT_YUV422P16LE:
1893         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
1894     }
1895     if (c->chrSrcHSubSample) {
1896         switch(srcFormat) {
1897         case PIX_FMT_RGB48BE: c->chrToYV12 = rgb48BEToUV_half_c; break;
1898         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48LEToUV_half_c; break;
1899         case PIX_FMT_BGR48BE: c->chrToYV12 = bgr48BEToUV_half_c; break;
1900         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48LEToUV_half_c; break;
1901         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half_c;  break;
1902         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half_c; break;
1903         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_half_c; break;
1904         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half_c; break;
1905         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half_c; break;
1906         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half_c;  break;
1907         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half_c; break;
1908         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_half_c; break;
1909         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half_c; break;
1910         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half_c; break;
1911         }
1912     } else {
1913         switch(srcFormat) {
1914         case PIX_FMT_RGB48BE: c->chrToYV12 = rgb48BEToUV_c; break;
1915         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48LEToUV_c; break;
1916         case PIX_FMT_BGR48BE: c->chrToYV12 = bgr48BEToUV_c; break;
1917         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48LEToUV_c; break;
1918         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_c;  break;
1919         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_c; break;
1920         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_c; break;
1921         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_c; break;
1922         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_c; break;
1923         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_c;  break;
1924         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_c; break;
1925         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_c; break;
1926         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_c; break;
1927         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_c; break;
1928         }
1929     }
1930
1931     c->lumToYV12 = NULL;
1932     c->alpToYV12 = NULL;
1933     switch (srcFormat) {
1934     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
1935     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
1936     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
1937     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
1938     case PIX_FMT_YUYV422  :
1939     case PIX_FMT_YUV420P16BE:
1940     case PIX_FMT_YUV422P16BE:
1941     case PIX_FMT_YUV444P16BE:
1942     case PIX_FMT_Y400A    :
1943     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
1944     case PIX_FMT_UYVY422  :
1945     case PIX_FMT_YUV420P16LE:
1946     case PIX_FMT_YUV422P16LE:
1947     case PIX_FMT_YUV444P16LE:
1948     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c; break;
1949     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c; break;
1950     case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY_c; break;
1951     case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY_c; break;
1952     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c; break;
1953     case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY_c; break;
1954     case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY_c; break;
1955     case PIX_FMT_RGB8     :
1956     case PIX_FMT_BGR8     :
1957     case PIX_FMT_PAL8     :
1958     case PIX_FMT_BGR4_BYTE:
1959     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
1960     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
1961     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
1962     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
1963     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
1964     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
1965     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
1966     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
1967     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
1968     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
1969     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
1970     }
1971     if (c->alpPixBuf) {
1972         switch (srcFormat) {
1973         case PIX_FMT_BGRA:
1974         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
1975         case PIX_FMT_ABGR:
1976         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
1977         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
1978         }
1979     }
1980
1981     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
1982         if (c->srcRange) {
1983             c->lumConvertRange = lumRangeFromJpeg_c;
1984             c->chrConvertRange = chrRangeFromJpeg_c;
1985         } else {
1986             c->lumConvertRange = lumRangeToJpeg_c;
1987             c->chrConvertRange = chrRangeToJpeg_c;
1988         }
1989     }
1990
1991     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
1992           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
1993         c->needs_hcscale = 1;
1994 }
1995
1996 SwsFunc ff_getSwsFunc(SwsContext *c)
1997 {
1998     sws_init_swScale_c(c);
1999
2000     if (HAVE_MMX)
2001         ff_sws_init_swScale_mmx(c);
2002     if (HAVE_ALTIVEC)
2003         ff_sws_init_swScale_altivec(c);
2004
2005     return swScale;
2006 }