git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
/* Compile-time switch; only its definition is visible in this part of the
 * file. NOTE(review): presumably selects a dithering variant for the
 * low-depth RGB writers -- confirm against the code that tests it. */
#define DITHER1XBPP

/*
 * Fixed-point RGB -> YUV coefficients.
 *
 * Each constant is a floating-point weight scaled by 1 << RGB2YUV_SHIFT and
 * rounded to the nearest integer (the +0.5 before the int cast); negative
 * weights are formed by negating the rounded magnitude.
 * Naming: the first letter is the input channel (R/G/B), the second the
 * output component (Y/U/V).
 * Luma weights carry an extra 219/255 factor and chroma weights an extra
 * 224/255 factor.
 */
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
/*
 * Floating-point RGB -> YUV weights, one row of nine values per colourimetry
 * (the trailing comment on each row names it).
 *
 * Column order within a row, as can be read off the ITU601 rows
 * (0.587 / 0.114 / 0.299 are the G / B / R luma weights):
 *   [0..2] G, B, R contribution to Y
 *   [3..5] G, B, R contribution to U
 *   [6..8] G, B, R contribution to V
 *
 * NOTE(review): the row index is presumably a colourspace identifier chosen
 * by the caller -- confirm at the lookup site, which is not in this part of
 * the file.
 */
static const double rgb2yuv_table[8][9]={
    {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
    {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
};
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
/*
 * Ordered-dither offset tables. Each row holds eight offsets, one per
 * x position modulo 8; the row is selected from the output line number.
 * DECLARE_ALIGNED is defined outside this file section (presumably it
 * requests the given byte alignment -- confirm in its definition).
 */

/* Two rows, offsets 0..3, pattern repeats every two columns. File-local. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
{  1,   3,   1,   3,   1,   3,   1,   3, },
{  2,   0,   2,   0,   2,   0,   2,   0, },
};

/* Two rows, offsets 0/2/4/6, pattern repeats every two columns. File-local. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
{  6,   2,   6,   2,   6,   2,   6,   2, },
{  0,   4,   0,   4,   0,   4,   0,   4, },
};

/* Four rows, offsets 0..15, pattern repeats every four columns.
 * Not static, so it has external linkage. */
DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
{  8,   4,  11,   7,   8,   4,  11,   7, },
{  2,  14,   1,  13,   2,  14,   1,  13, },
{ 10,   6,   9,   5,  10,   6,   9,   5, },
{  0,  12,   3,  15,   0,  12,   3,  15, },
};

/* Eight rows, offsets 0..31. Not static, so it has external linkage. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
{ 17,   9,  23,  15,  16,   8,  22,  14, },
{  5,  29,   3,  27,   4,  28,   2,  26, },
{ 21,  13,  19,  11,  20,  12,  18,  10, },
{  0,  24,   6,  30,   1,  25,   7,  31, },
{ 16,   8,  22,  14,  17,   9,  23,  15, },
{  4,  28,   2,  26,   5,  29,   3,  27, },
{ 20,  12,  18,  10,  21,  13,  19,  11, },
{  1,  25,   7,  31,   0,  24,   6,  30, },
};

/* Eight rows, offsets 0..72. Not static, so it has external linkage. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
{  0,  55,  14,  68,   3,  58,  17,  72, },
{ 37,  18,  50,  32,  40,  22,  54,  35, },
{  9,  64,   5,  59,  13,  67,   8,  63, },
{ 46,  27,  41,  23,  49,  31,  44,  26, },
{  2,  57,  16,  71,   1,  56,  15,  70, },
{ 39,  21,  52,  34,  38,  19,  51,  33, },
{ 11,  66,   7,  62,  10,  65,   6,  60, },
{ 48,  30,  43,  25,  47,  29,  42,  24, },
};
 147
/*
 * 8x8 ordered-dither offsets with a large amplitude (0..217 in the active
 * variant).
 *
 * Four alternative tables are kept side by side. Because the first condition
 * is the literal `#if 1`, only the first table is ever compiled; the
 * gamma-compensating variants behind `#elif 1` / `#else` are inactive and
 * can be enabled only by editing the conditions by hand.
 */
#if 1
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{117,  62, 158, 103, 113,  58, 155, 100, },
{ 34, 199,  21, 186,  31, 196,  17, 182, },
{144,  89, 131,  76, 141,  86, 127,  72, },
{  0, 165,  41, 206,  10, 175,  52, 217, },
{110,  55, 151,  96, 120,  65, 162, 107, },
{ 28, 193,  14, 179,  38, 203,  24, 189, },
{138,  83, 124,  69, 148,  93, 134,  79, },
{  7, 172,  48, 213,   3, 168,  45, 210, },
};
#elif 1
// tries to correct a gamma of 1.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 143,  18, 200,   2, 156,  25, 215, },
{ 78,  28, 125,  64,  89,  36, 138,  74, },
{ 10, 180,   3, 161,  16, 195,   8, 175, },
{109,  51,  93,  38, 121,  60, 105,  47, },
{  1, 152,  23, 210,   0, 147,  20, 205, },
{ 85,  33, 134,  71,  81,  30, 130,  67, },
{ 14, 190,   6, 171,  12, 185,   5, 166, },
{117,  57, 101,  44, 113,  54,  97,  41, },
};
#elif 1
// tries to correct a gamma of 2.0
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 124,   8, 193,   0, 140,  12, 213, },
{ 55,  14, 104,  42,  66,  19, 119,  52, },
{  3, 168,   1, 145,   6, 187,   3, 162, },
{ 86,  31,  70,  21,  99,  39,  82,  28, },
{  0, 134,  11, 206,   0, 129,   9, 200, },
{ 62,  17, 114,  48,  58,  16, 109,  45, },
{  5, 181,   2, 157,   4, 175,   1, 151, },
{ 95,  36,  78,  26,  90,  34,  74,  24, },
};
#else
// tries to correct a gamma of 2.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 107,   3, 187,   0, 125,   6, 212, },
{ 39,   7,  86,  28,  49,  11, 102,  36, },
{  1, 158,   0, 131,   3, 180,   1, 151, },
{ 68,  19,  52,  12,  81,  25,  64,  17, },
{  0, 119,   5, 203,   0, 113,   4, 195, },
{ 45,   9,  96,  33,  42,   8,  91,  30, },
{  2, 172,   1, 144,   2, 165,   0, 137, },
{ 77,  23,  60,  15,  72,  21,  56,  14, },
};
#endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 203                       uint16_t *aDest, int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     int shift = 11 + 16 - output_bits;
 209
 210 #define output_pixel(pos, val) \
 211     if (big_endian) { \
 212         if (output_bits == 16) { \
 213             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 214         } else { \
 215             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 216         } \
 217     } else { \
 218         if (output_bits == 16) { \
 219             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 220         } else { \
 221             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 222         } \
 223     }
 224     for (i = 0; i < dstW; i++) {
 225         int val = 1 << (26-output_bits);
 226         int j;
 227
 228         for (j = 0; j < lumFilterSize; j++)
 229             val += lumSrc[j][i] * lumFilter[j];
 230
 231         output_pixel(&dest[i], val);
 232     }
 233
 234     if (uDest) {
 235         for (i = 0; i < chrDstW; i++) {
 236             int u = 1 << (26-output_bits);
 237             int v = 1 << (26-output_bits);
 238             int j;
 239
 240             for (j = 0; j < chrFilterSize; j++) {
 241                 u += chrUSrc[j][i] * chrFilter[j];
 242                 v += chrVSrc[j][i] * chrFilter[j];
 243             }
 244
 245             output_pixel(&uDest[i], u);
 246             output_pixel(&vDest[i], v);
 247         }
 248     }
 249
 250     if (CONFIG_SWSCALE_ALPHA && aDest) {
 251         for (i = 0; i < dstW; i++) {
 252             int val = 1 << (26-output_bits);
 253             int j;
 254
 255             for (j = 0; j < lumFilterSize; j++)
 256                 val += alpSrc[j][i] * lumFilter[j];
 257
 258             output_pixel(&aDest[i], val);
 259         }
 260     }
 261 }
 262
 263 #define yuv2NBPS(bits, BE_LE, is_be) \
 264 static void yuv2yuvX ## bits ## BE_LE ## _c(const int16_t *lumFilter, \
 265                               const int16_t **lumSrc, int lumFilterSize, \
 266                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 267                               const int16_t **chrVSrc, \
 268                               int chrFilterSize, const int16_t **alpSrc, \
 269                               uint16_t *dest, uint16_t *uDest, uint16_t *vDest, \
 270                               uint16_t *aDest, int dstW, int chrDstW) \
 271 { \
 272     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 273                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 274                           alpSrc, \
 275                           dest, uDest, vDest, aDest, \
 276                           dstW, chrDstW, is_be, bits); \
 277 }
 278 yuv2NBPS( 9, BE, 1);
 279 yuv2NBPS( 9, LE, 0);
 280 yuv2NBPS(10, BE, 1);
 281 yuv2NBPS(10, LE, 0);
 282 yuv2NBPS(16, BE, 1);
 283 yuv2NBPS(16, LE, 0);
 284
 285 static inline void yuv2yuvX16_c(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
 286                                 const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize,
 287                                 const int16_t **alpSrc, uint16_t *dest, uint16_t *uDest, uint16_t *vDest, uint16_t *aDest, int dstW, int chrDstW,
 288                                 enum PixelFormat dstFormat)
 289 {
 290 #define conv16(bits) \
 291     if (isBE(dstFormat)) { \
 292         yuv2yuvX ## bits ## BE_c(lumFilter, lumSrc, lumFilterSize, \
 293                                chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 294                                alpSrc, \
 295                                dest, uDest, vDest, aDest, \
 296                                dstW, chrDstW); \
 297     } else { \
 298         yuv2yuvX ## bits ## LE_c(lumFilter, lumSrc, lumFilterSize, \
 299                                chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 300                                alpSrc, \
 301                                dest, uDest, vDest, aDest, \
 302                                dstW, chrDstW); \
 303     }
 304     if (is16BPS(dstFormat)) {
 305         conv16(16);
 306     } else if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
 307         conv16(9);
 308     } else {
 309         conv16(10);
 310     }
 311 #undef conv16
 312 }
 313
 314 static inline void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 315                               const int16_t **lumSrc, int lumFilterSize,
 316                               const int16_t *chrFilter, const int16_t **chrUSrc,
 317                               const int16_t **chrVSrc,
 318                               int chrFilterSize, const int16_t **alpSrc,
 319                               uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 320                               uint8_t *aDest, int dstW, int chrDstW)
 321 {
 322     //FIXME Optimize (just quickly written not optimized..)
 323     int i;
 324     for (i=0; i<dstW; i++) {
 325         int val=1<<18;
 326         int j;
 327         for (j=0; j<lumFilterSize; j++)
 328             val += lumSrc[j][i] * lumFilter[j];
 329
 330         dest[i]= av_clip_uint8(val>>19);
 331     }
 332
 333     if (uDest)
 334         for (i=0; i<chrDstW; i++) {
 335             int u=1<<18;
 336             int v=1<<18;
 337             int j;
 338             for (j=0; j<chrFilterSize; j++) {
 339                 u += chrUSrc[j][i] * chrFilter[j];
 340                 v += chrVSrc[j][i] * chrFilter[j];
 341             }
 342
 343             uDest[i]= av_clip_uint8(u>>19);
 344             vDest[i]= av_clip_uint8(v>>19);
 345         }
 346
 347     if (CONFIG_SWSCALE_ALPHA && aDest)
 348         for (i=0; i<dstW; i++) {
 349             int val=1<<18;
 350             int j;
 351             for (j=0; j<lumFilterSize; j++)
 352                 val += alpSrc[j][i] * lumFilter[j];
 353
 354             aDest[i]= av_clip_uint8(val>>19);
 355         }
 356
 357 }
 358
 359 static inline void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 360                                const int16_t **lumSrc, int lumFilterSize,
 361                                const int16_t *chrFilter, const int16_t **chrUSrc,
 362                                const int16_t **chrVSrc,
 363                                int chrFilterSize, uint8_t *dest, uint8_t *uDest,
 364                                int dstW, int chrDstW, enum PixelFormat dstFormat)
 365 {
 366     //FIXME Optimize (just quickly written not optimized..)
 367     int i;
 368     for (i=0; i<dstW; i++) {
 369         int val=1<<18;
 370         int j;
 371         for (j=0; j<lumFilterSize; j++)
 372             val += lumSrc[j][i] * lumFilter[j];
 373
 374         dest[i]= av_clip_uint8(val>>19);
 375     }
 376
 377     if (!uDest)
 378         return;
 379
 380     if (dstFormat == PIX_FMT_NV12)
 381         for (i=0; i<chrDstW; i++) {
 382             int u=1<<18;
 383             int v=1<<18;
 384             int j;
 385             for (j=0; j<chrFilterSize; j++) {
 386                 u += chrUSrc[j][i] * chrFilter[j];
 387                 v += chrVSrc[j][i] * chrFilter[j];
 388             }
 389
 390             uDest[2*i]= av_clip_uint8(u>>19);
 391             uDest[2*i+1]= av_clip_uint8(v>>19);
 392         }
 393     else
 394         for (i=0; i<chrDstW; i++) {
 395             int u=1<<18;
 396             int v=1<<18;
 397             int j;
 398             for (j=0; j<chrFilterSize; j++) {
 399                 u += chrUSrc[j][i] * chrFilter[j];
 400                 v += chrVSrc[j][i] * chrFilter[j];
 401             }
 402
 403             uDest[2*i]= av_clip_uint8(v>>19);
 404             uDest[2*i+1]= av_clip_uint8(u>>19);
 405         }
 406 }
 407
/*
 * Loop header for packed output with a full vertical filter, two luma
 * samples per iteration and one shared chroma pair.
 *
 * Expects in the enclosing scope: i, dstW, lumSrc, lumFilter, lumFilterSize,
 * chrUSrc, chrVSrc, chrFilter, chrFilterSize, plus alpSrc when `alpha` is
 * non-zero.
 * Declares for the loop body: Y1, Y2, U, V (and A1, A2, valid only when
 * `alpha` is non-zero), the table pointers r, g, b of type `type *`
 * (declared but not assigned here) and i2 = 2*i.
 * Every accumulator starts at 1<<18 and is shifted right by 19 after the
 * taps are summed. No range clipping is done in this variant.
 *
 * The macro leaves the `for` body OPEN: the code that uses it must supply
 * the statements for one pixel pair and the closing brace.
 */
#define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha) \
    for (i=0; i<(dstW>>1); i++) {\
        int j;\
        int Y1 = 1<<18;\
        int Y2 = 1<<18;\
        int U  = 1<<18;\
        int V  = 1<<18;\
        int av_unused A1, A2;\
        type av_unused *r, *b, *g;\
        const int i2= 2*i;\
        \
        for (j=0; j<lumFilterSize; j++) {\
            Y1 += lumSrc[j][i2] * lumFilter[j];\
            Y2 += lumSrc[j][i2+1] * lumFilter[j];\
        }\
        for (j=0; j<chrFilterSize; j++) {\
            U += chrUSrc[j][i] * chrFilter[j];\
            V += chrVSrc[j][i] * chrFilter[j];\
        }\
        Y1>>=19;\
        Y2>>=19;\
        U >>=19;\
        V >>=19;\
        if (alpha) {\
            A1 = 1<<18;\
            A2 = 1<<18;\
            for (j=0; j<lumFilterSize; j++) {\
                A1 += alpSrc[j][i2  ] * lumFilter[j];\
                A2 += alpSrc[j][i2+1] * lumFilter[j];\
            }\
            A1>>=19;\
            A2>>=19;\
        }

/*
 * Same as YSCALE_YUV_2_PACKEDX_NOCLIP_C, followed by clamping of Y1, Y2, U,
 * V (and A1, A2 when `alpha` is non-zero) to 0..255. The `for` body is still
 * left open for the user.
 *
 * The clamp is guarded by a cheap test that looks only at bit 8 of the OR of
 * the values, so it triggers for values in -256..-1 and 256..511 but not for
 * values further out of range.
 * NOTE(review): presumably the filter gain keeps results inside that window
 * -- confirm before relying on the clamp for table indexing.
 */
#define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
        YSCALE_YUV_2_PACKEDX_NOCLIP_C(type,alpha)\
        if ((Y1|Y2|U|V)&256) {\
            if (Y1>255)   Y1=255; \
            else if (Y1<0)Y1=0;   \
            if (Y2>255)   Y2=255; \
            else if (Y2<0)Y2=0;   \
            if (U>255)    U=255;  \
            else if (U<0) U=0;    \
            if (V>255)    V=255;  \
            else if (V<0) V=0;    \
        }\
        if (alpha && ((A1|A2)&256)) {\
            A1=av_clip_uint8(A1);\
            A2=av_clip_uint8(A2);\
        }
 458
 459 #define YSCALE_YUV_2_PACKEDX_FULL_C(rnd,alpha) \
 460     for (i=0; i<dstW; i++) {\
 461         int j;\
 462         int Y = 0;\
 463         int U = -128<<19;\
 464         int V = -128<<19;\
 465         int av_unused A;\
 466         int R,G,B;\
 467         \
 468         for (j=0; j<lumFilterSize; j++) {\
 469             Y += lumSrc[j][i     ] * lumFilter[j];\
 470         }\
 471         for (j=0; j<chrFilterSize; j++) {\
 472             U += chrUSrc[j][i] * chrFilter[j];\
 473             V += chrVSrc[j][i] * chrFilter[j];\
 474         }\
 475         Y >>=10;\
 476         U >>=10;\
 477         V >>=10;\
 478         if (alpha) {\
 479             A = rnd;\
 480             for (j=0; j<lumFilterSize; j++)\
 481                 A += alpSrc[j][i     ] * lumFilter[j];\
 482             A >>=19;\
 483             if (A&256)\
 484                 A = av_clip_uint8(A);\
 485         }
 486
 487 #define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
 488     YSCALE_YUV_2_PACKEDX_FULL_C(rnd>>3,alpha)\
 489         Y-= c->yuv2rgb_y_offset;\
 490         Y*= c->yuv2rgb_y_coeff;\
 491         Y+= rnd;\
 492         R= Y + V*c->yuv2rgb_v2r_coeff;\
 493         G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
 494         B= Y +                          U*c->yuv2rgb_u2b_coeff;\
 495         if ((R|G|B)&(0xC0000000)) {\
 496             if (R>=(256<<22))   R=(256<<22)-1; \
 497             else if (R<0)R=0;   \
 498             if (G>=(256<<22))   G=(256<<22)-1; \
 499             else if (G<0)G=0;   \
 500             if (B>=(256<<22))   B=(256<<22)-1; \
 501             else if (B<0)B=0;   \
 502         }
 503
/*
 * Loop header for 16-bit gray output with a full vertical filter, two luma
 * samples per iteration.
 *
 * Expects in the enclosing scope: i, dstW, lumSrc, lumFilter, lumFilterSize.
 * Declares for the loop body: Y1, Y2 and i2 = 2*i. The accumulators start at
 * 1<<18, are shifted right by 11 and clamped to 0..65535; the clamp runs only
 * when bit 16 of the OR-ed values is set.
 *
 * U and V are declared and set to 1<<18 but never accumulated; as 1<<18 has
 * bit 16 clear, including them in the range test has no effect.
 *
 * The macro leaves the `for` body OPEN; the user supplies the closing brace.
 */
#define YSCALE_YUV_2_GRAY16_C \
    for (i=0; i<(dstW>>1); i++) {\
        int j;\
        int Y1 = 1<<18;\
        int Y2 = 1<<18;\
        int U  = 1<<18;\
        int V  = 1<<18;\
        \
        const int i2= 2*i;\
        \
        for (j=0; j<lumFilterSize; j++) {\
            Y1 += lumSrc[j][i2] * lumFilter[j];\
            Y2 += lumSrc[j][i2+1] * lumFilter[j];\
        }\
        Y1>>=11;\
        Y2>>=11;\
        if ((Y1|Y2|U|V)&65536) {\
            if (Y1>65535)   Y1=65535; \
            else if (Y1<0)Y1=0;   \
            if (Y2>65535)   Y2=65535; \
            else if (Y2<0)Y2=0;   \
        }
 526
/*
 * YSCALE_YUV_2_PACKEDX_C (filtered and clamped Y1, Y2, U, V) followed by the
 * lookup of the per-chroma colour tables in the context `c`:
 *   r = table_rV[V],  g = table_gU[U] + table_gV[V],  b = table_bU[U],
 * each cast to `type *`. The loop body is left open for the user, who
 * indexes r, g and b with the luma values.
 */
#define YSCALE_YUV_2_RGBX_C(type,alpha) \
    YSCALE_YUV_2_PACKEDX_C(type,alpha)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
    r = (type *)c->table_rV[V];   \
    g = (type *)(c->table_gU[U] + c->table_gV[V]); \
    b = (type *)c->table_bU[U];
 532
/*
 * Loop header for packed output blended from two input lines.
 *
 * Expects in the enclosing scope: i, dstW, buf0, buf1, ubuf0, ubuf1, vbuf0,
 * vbuf1, the weights yalpha / yalpha1 and uvalpha / uvalpha1, plus abuf0 and
 * abuf1 when `alpha` is non-zero.
 * Declares for the loop body: i2 = 2*i, Y1, Y2, U, V, the table pointers
 * r, g, b (unassigned here) and A1, A2 (valid only when `alpha` is non-zero).
 * Each value is (line0 * weight1 + line1 * weight) >> 19; nothing is clamped.
 *
 * The macro leaves the `for` body OPEN; the user supplies the closing brace.
 */
#define YSCALE_YUV_2_PACKED2_C(type,alpha)   \
    for (i=0; i<(dstW>>1); i++) { \
        const int i2= 2*i;       \
        int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
        int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
        int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19;              \
        int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19;              \
        type av_unused *r, *b, *g;                                    \
        int av_unused A1, A2;                                         \
        if (alpha) {\
            A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
            A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
        }

/*
 * Two-line blend for 16-bit gray: same luma formula as above but shifted
 * right by 11 instead of 19. Declares i2, Y1 and Y2 only; the `for` body is
 * left open.
 */
#define YSCALE_YUV_2_GRAY16_2_C   \
    for (i=0; i<(dstW>>1); i++) { \
        const int i2= 2*i;       \
        int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
        int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;

/*
 * YSCALE_YUV_2_PACKED2_C followed by the colour table lookup in `c`
 * (table_rV[V], table_gU[U] + table_gV[V], table_bU[U]) into r, g, b.
 * U and V are used as table indices without clamping.
 */
#define YSCALE_YUV_2_RGB2_C(type,alpha) \
    YSCALE_YUV_2_PACKED2_C(type,alpha)\
    r = (type *)c->table_rV[V];\
    g = (type *)(c->table_gU[U] + c->table_gV[V]);\
    b = (type *)c->table_bU[U];
 558
/*
 * Loop header for packed output taken from a single input line (no vertical
 * interpolation).
 *
 * Expects in the enclosing scope: i, dstW, buf0, ubuf1, vbuf1, plus abuf0
 * when `alpha` is non-zero.
 * Declares for the loop body: i2 = 2*i, Y1, Y2 (from buf0), U, V (from
 * ubuf1 / vbuf1 -- note: the "1" buffers, not ubuf0 / vbuf0), the table
 * pointers r, g, b (unassigned here) and A1, A2 (valid only when `alpha` is
 * non-zero). All values are the input shifted right by 7; nothing is clamped.
 *
 * The macro leaves the `for` body OPEN; the user supplies the closing brace.
 */
#define YSCALE_YUV_2_PACKED1_C(type,alpha) \
    for (i=0; i<(dstW>>1); i++) {\
        const int i2= 2*i;\
        int Y1= buf0[i2  ]>>7;\
        int Y2= buf0[i2+1]>>7;\
        int U= (ubuf1[i])>>7;\
        int V= (vbuf1[i])>>7;\
        type av_unused *r, *b, *g;\
        int av_unused A1, A2;\
        if (alpha) {\
            A1= abuf0[i2  ]>>7;\
            A2= abuf0[i2+1]>>7;\
        }

/*
 * Single-line variant for 16-bit gray: Y1 and Y2 are the buf0 samples
 * shifted LEFT by one. Declares i2, Y1 and Y2 only; the `for` body is left
 * open.
 */
#define YSCALE_YUV_2_GRAY16_1_C \
    for (i=0; i<(dstW>>1); i++) {\
        const int i2= 2*i;\
        int Y1= buf0[i2  ]<<1;\
        int Y2= buf0[i2+1]<<1;

/*
 * YSCALE_YUV_2_PACKED1_C followed by the colour table lookup in `c`
 * (table_rV[V], table_gU[U] + table_gV[V], table_bU[U]) into r, g, b.
 */
#define YSCALE_YUV_2_RGB1_C(type,alpha) \
    YSCALE_YUV_2_PACKED1_C(type,alpha)\
    r = (type *)c->table_rV[V];\
    g = (type *)(c->table_gU[U] + c->table_gV[V]);\
    b = (type *)c->table_bU[U];
 584
/*
 * Like YSCALE_YUV_2_PACKED1_C (luma and alpha from a single line, >>7), but
 * chroma is the average of two lines: (ubuf0[i] + ubuf1[i]) >> 8 and the
 * same for V.
 *
 * Expects in the enclosing scope: i, dstW, buf0, ubuf0, ubuf1, vbuf0, vbuf1,
 * plus abuf0 when `alpha` is non-zero.
 * Declares i2, Y1, Y2, U, V, r, g, b, A1, A2 for the loop body and leaves the
 * `for` body OPEN; the user supplies the closing brace.
 */
#define YSCALE_YUV_2_PACKED1B_C(type,alpha) \
    for (i=0; i<(dstW>>1); i++) {\
        const int i2= 2*i;\
        int Y1= buf0[i2  ]>>7;\
        int Y2= buf0[i2+1]>>7;\
        int U= (ubuf0[i] + ubuf1[i])>>8;\
        int V= (vbuf0[i] + vbuf1[i])>>8;\
        type av_unused *r, *b, *g;\
        int av_unused A1, A2;\
        if (alpha) {\
            A1= abuf0[i2  ]>>7;\
            A2= abuf0[i2+1]>>7;\
        }

/*
 * YSCALE_YUV_2_PACKED1B_C followed by the colour table lookup in `c`
 * (table_rV[V], table_gU[U] + table_gV[V], table_bU[U]) into r, g, b.
 */
#define YSCALE_YUV_2_RGB1B_C(type,alpha) \
    YSCALE_YUV_2_PACKED1B_C(type,alpha)\
    r = (type *)c->table_rV[V];\
    g = (type *)(c->table_gU[U] + c->table_gV[V]);\
    b = (type *)c->table_bU[U];
 604
/*
 * Complete loop (not left open) that writes 1 bit-per-pixel output blended
 * from two luma lines, eight pixels = one byte per iteration.
 *
 * Expects in the enclosing scope: i, dstW, y, buf0, buf1, yalpha, yalpha1,
 * dest and the context `c`.
 * For each pixel the blended luma ((buf0*yalpha1 + buf1*yalpha) >> 19) plus
 * the dither offset for its column (row y&7 of dither_8x8_220) indexes the
 * table g = table_gU[128] + table_gV[128]; the looked-up value is shifted
 * into `acc`, first pixel ending up in the most significant bit.
 * The byte is stored as-is for PIX_FMT_MONOBLACK and bit-inverted for any
 * other dstFormat, then dest is advanced by one byte.
 * The luma index is not clamped. Pixels beyond the last full group of eight
 * are not written by this loop.
 */
#define YSCALE_YUV_2_MONO2_C \
    const uint8_t * const d128=dither_8x8_220[y&7];\
    uint8_t *g= c->table_gU[128] + c->table_gV[128];\
    for (i=0; i<dstW-7; i+=8) {\
        int acc;\
        acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
        acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
        acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
        acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
        acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
        acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
        acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
        acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
        ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
        dest++;\
    }
 621
 622 #define YSCALE_YUV_2_MONOX_C \
 623     const uint8_t * const d128=dither_8x8_220[y&7];\
 624     uint8_t *g= c->table_gU[128] + c->table_gV[128];\
 625     int acc=0;\
 626     for (i=0; i<dstW-1; i+=2) {\
 627         int j;\
 628         int Y1=1<<18;\
 629         int Y2=1<<18;\
 630 \
 631         for (j=0; j<lumFilterSize; j++) {\
 632             Y1 += lumSrc[j][i] * lumFilter[j];\
 633             Y2 += lumSrc[j][i+1] * lumFilter[j];\
 634         }\
 635         Y1>>=19;\
 636         Y2>>=19;\
 637         if ((Y1|Y2)&256) {\
 638             if (Y1>255)   Y1=255;\
 639             else if (Y1<0)Y1=0;\
 640             if (Y2>255)   Y2=255;\
 641             else if (Y2<0)Y2=0;\
 642         }\
 643         acc+= acc + g[Y1+d128[(i+0)&7]];\
 644         acc+= acc + g[Y2+d128[(i+1)&7]];\
 645         if ((i&7)==6) {\
 646             ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
 647             dest++;\
 648         }\
 649     }
 650
 651 #define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
 652     switch(c->dstFormat) {\
 653     case PIX_FMT_RGB48BE:\
 654     case PIX_FMT_RGB48LE:\
 655         func(uint8_t,0)\
 656             ((uint8_t*)dest)[ 0]= r[Y1];\
 657             ((uint8_t*)dest)[ 1]= r[Y1];\
 658             ((uint8_t*)dest)[ 2]= g[Y1];\
 659             ((uint8_t*)dest)[ 3]= g[Y1];\
 660             ((uint8_t*)dest)[ 4]= b[Y1];\
 661             ((uint8_t*)dest)[ 5]= b[Y1];\
 662             ((uint8_t*)dest)[ 6]= r[Y2];\
 663             ((uint8_t*)dest)[ 7]= r[Y2];\
 664             ((uint8_t*)dest)[ 8]= g[Y2];\
 665             ((uint8_t*)dest)[ 9]= g[Y2];\
 666             ((uint8_t*)dest)[10]= b[Y2];\
 667             ((uint8_t*)dest)[11]= b[Y2];\
 668             dest+=12;\
 669         }\
 670         break;\
 671     case PIX_FMT_BGR48BE:\
 672     case PIX_FMT_BGR48LE:\
 673         func(uint8_t,0)\
 674             ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
 675             ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
 676             ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
 677             ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
 678             ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
 679             ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
 680             dest+=12;\
 681         }\
 682         break;\
 683     case PIX_FMT_RGBA:\
 684     case PIX_FMT_BGRA:\
 685         if (CONFIG_SMALL) {\
 686             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 687             func(uint32_t,needAlpha)\
 688                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
 689                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
 690             }\
 691         } else {\
 692             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 693                 func(uint32_t,1)\
 694                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
 695                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
 696                 }\
 697             } else {\
 698                 func(uint32_t,0)\
 699                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 700                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 701                 }\
 702             }\
 703         }\
 704         break;\
 705     case PIX_FMT_ARGB:\
 706     case PIX_FMT_ABGR:\
 707         if (CONFIG_SMALL) {\
 708             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 709             func(uint32_t,needAlpha)\
 710                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
 711                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
 712             }\
 713         } else {\
 714             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 715                 func(uint32_t,1)\
 716                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
 717                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
 718                 }\
 719             } else {\
 720                 func(uint32_t,0)\
 721                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 722                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 723                 }\
 724             }\
 725         }                \
 726         break;\
 727     case PIX_FMT_RGB24:\
 728         func(uint8_t,0)\
 729             ((uint8_t*)dest)[0]= r[Y1];\
 730             ((uint8_t*)dest)[1]= g[Y1];\
 731             ((uint8_t*)dest)[2]= b[Y1];\
 732             ((uint8_t*)dest)[3]= r[Y2];\
 733             ((uint8_t*)dest)[4]= g[Y2];\
 734             ((uint8_t*)dest)[5]= b[Y2];\
 735             dest+=6;\
 736         }\
 737         break;\
 738     case PIX_FMT_BGR24:\
 739         func(uint8_t,0)\
 740             ((uint8_t*)dest)[0]= b[Y1];\
 741             ((uint8_t*)dest)[1]= g[Y1];\
 742             ((uint8_t*)dest)[2]= r[Y1];\
 743             ((uint8_t*)dest)[3]= b[Y2];\
 744             ((uint8_t*)dest)[4]= g[Y2];\
 745             ((uint8_t*)dest)[5]= r[Y2];\
 746             dest+=6;\
 747         }\
 748         break;\
 749     case PIX_FMT_RGB565BE:\
 750     case PIX_FMT_RGB565LE:\
 751     case PIX_FMT_BGR565BE:\
 752     case PIX_FMT_BGR565LE:\
 753         {\
 754             const int dr1= dither_2x2_8[y&1    ][0];\
 755             const int dg1= dither_2x2_4[y&1    ][0];\
 756             const int db1= dither_2x2_8[(y&1)^1][0];\
 757             const int dr2= dither_2x2_8[y&1    ][1];\
 758             const int dg2= dither_2x2_4[y&1    ][1];\
 759             const int db2= dither_2x2_8[(y&1)^1][1];\
 760             func(uint16_t,0)\
 761                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 762                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 763             }\
 764         }\
 765         break;\
 766     case PIX_FMT_RGB555BE:\
 767     case PIX_FMT_RGB555LE:\
 768     case PIX_FMT_BGR555BE:\
 769     case PIX_FMT_BGR555LE:\
 770         {\
 771             const int dr1= dither_2x2_8[y&1    ][0];\
 772             const int dg1= dither_2x2_8[y&1    ][1];\
 773             const int db1= dither_2x2_8[(y&1)^1][0];\
 774             const int dr2= dither_2x2_8[y&1    ][1];\
 775             const int dg2= dither_2x2_8[y&1    ][0];\
 776             const int db2= dither_2x2_8[(y&1)^1][1];\
 777             func(uint16_t,0)\
 778                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 779                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 780             }\
 781         }\
 782         break;\
 783     case PIX_FMT_RGB444BE:\
 784     case PIX_FMT_RGB444LE:\
 785     case PIX_FMT_BGR444BE:\
 786     case PIX_FMT_BGR444LE:\
 787         {\
 788             const int dr1= dither_4x4_16[y&3    ][0];\
 789             const int dg1= dither_4x4_16[y&3    ][1];\
 790             const int db1= dither_4x4_16[(y&3)^3][0];\
 791             const int dr2= dither_4x4_16[y&3    ][1];\
 792             const int dg2= dither_4x4_16[y&3    ][0];\
 793             const int db2= dither_4x4_16[(y&3)^3][1];\
 794             func(uint16_t,0)\
 795                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 796                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 797             }\
 798         }\
 799         break;\
 800     case PIX_FMT_RGB8:\
 801     case PIX_FMT_BGR8:\
 802         {\
 803             const uint8_t * const d64= dither_8x8_73[y&7];\
 804             const uint8_t * const d32= dither_8x8_32[y&7];\
 805             func(uint8_t,0)\
 806                 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
 807                 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
 808             }\
 809         }\
 810         break;\
 811     case PIX_FMT_RGB4:\
 812     case PIX_FMT_BGR4:\
 813         {\
 814             const uint8_t * const d64= dither_8x8_73 [y&7];\
 815             const uint8_t * const d128=dither_8x8_220[y&7];\
 816             func(uint8_t,0)\
 817                 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
 818                                  + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
 819             }\
 820         }\
 821         break;\
 822     case PIX_FMT_RGB4_BYTE:\
 823     case PIX_FMT_BGR4_BYTE:\
 824         {\
 825             const uint8_t * const d64= dither_8x8_73 [y&7];\
 826             const uint8_t * const d128=dither_8x8_220[y&7];\
 827             func(uint8_t,0)\
 828                 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
 829                 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
 830             }\
 831         }\
 832         break;\
 833     case PIX_FMT_MONOBLACK:\
 834     case PIX_FMT_MONOWHITE:\
 835         {\
 836             func_monoblack\
 837         }\
 838         break;\
 839     case PIX_FMT_YUYV422:\
 840         func2\
 841             ((uint8_t*)dest)[2*i2+0]= Y1;\
 842             ((uint8_t*)dest)[2*i2+1]= U;\
 843             ((uint8_t*)dest)[2*i2+2]= Y2;\
 844             ((uint8_t*)dest)[2*i2+3]= V;\
 845         }                \
 846         break;\
 847     case PIX_FMT_UYVY422:\
 848         func2\
 849             ((uint8_t*)dest)[2*i2+0]= U;\
 850             ((uint8_t*)dest)[2*i2+1]= Y1;\
 851             ((uint8_t*)dest)[2*i2+2]= V;\
 852             ((uint8_t*)dest)[2*i2+3]= Y2;\
 853         }                \
 854         break;\
 855     case PIX_FMT_GRAY16BE:\
 856         func_g16\
 857             ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
 858             ((uint8_t*)dest)[2*i2+1]= Y1;\
 859             ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
 860             ((uint8_t*)dest)[2*i2+3]= Y2;\
 861         }                \
 862         break;\
 863     case PIX_FMT_GRAY16LE:\
 864         func_g16\
 865             ((uint8_t*)dest)[2*i2+0]= Y1;\
 866             ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
 867             ((uint8_t*)dest)[2*i2+2]= Y2;\
 868             ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
 869         }                \
 870         break;\
 871     }
 872
 /**
  * Write one packed output line to dest using the "X" variants of the
  * per-format output macros.
  *
  * The whole body is a single expansion of YSCALE_YUV_2_ANYRGB_C, instantiated
  * with YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C, YSCALE_YUV_2_GRAY16_C and
  * YSCALE_YUV_2_MONOX_C.  The part of that macro visible in this file switches
  * over the destination pixel format, stores through dest, and uses y to pick
  * the dither row for the 16-bit-and-below RGB formats.
  *
  * NOTE(review): presumably lumFilter/chrFilter hold lumFilterSize and
  * chrFilterSize vertical taps that the helper macros apply to the
  * lumSrc/chrUSrc/chrVSrc (and alpSrc) lines -- those macros are defined
  * outside this view, confirm there.
  */
 873 static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
 874                           const int16_t **lumSrc, int lumFilterSize,
 875                           const int16_t *chrFilter, const int16_t **chrUSrc,
 876                           const int16_t **chrVSrc, int chrFilterSize,
 877                           const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 878 {
     /* i is referenced by name inside the expanded macro body */
 879     int i;
 880     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
 881 }
 882
 /**
  * Write one output line for the RGB24, BGR24, RGBA, BGRA, ARGB and ABGR
  * destination formats; any other c->dstFormat ends in assert(0).
  *
  * step is the number of bytes per output pixel (c->dstFormatBpp / 8) and
  * aidx is the offset, relative to dest, of the byte that receives alpha
  * (or the constant 255).  The case labels fall through on purpose:
  *   - RGBA/BGRA keep aidx = 3.
  *   - RGB24/BGR24 decrement it to 2; the 255 stored at dest[2] is then
  *     overwritten by the colour store that follows it.
  *   - ARGB/ABGR advance dest by one byte and end up with aidx = -1, so the
  *     alpha byte lands immediately before the three colour bytes.
  *
  * With CONFIG_SMALL a single macro expansion decides at run time whether
  * alpha is taken from A; otherwise two separate expansions are used.
  *
  * NOTE(review): YSCALE_YUV_2_RGBX_FULL_C is defined outside this view; it
  * evidently opens the per-pixel block that the closing brace after each use
  * terminates and supplies R, G, B (and A), which are shifted right by 22 to
  * obtain the stored bytes -- confirm against the macro definition.
  */
 883 static inline void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
 884                                    const int16_t **lumSrc, int lumFilterSize,
 885                                    const int16_t *chrFilter, const int16_t **chrUSrc,
 886                                    const int16_t **chrVSrc, int chrFilterSize,
 887                                    const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 888 {
 889     int i;
 890     int step= c->dstFormatBpp/8;
 891     int aidx= 3;
 892
 893     switch(c->dstFormat) {
 894     case PIX_FMT_ARGB:
 895         dest++;
 896         aidx= 0;
         /* fall through: aidx becomes -1 below */
 897     case PIX_FMT_RGB24:
 898         aidx--;
         /* fall through: shared R,G,B store code */
 899     case PIX_FMT_RGBA:
 900         if (CONFIG_SMALL) {
 901             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 902             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 903                 dest[aidx]= needAlpha ? A : 255;
 904                 dest[0]= R>>22;
 905                 dest[1]= G>>22;
 906                 dest[2]= B>>22;
 907                 dest+= step;
 908             }
 909         } else {
 910             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 911                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 912                     dest[aidx]= A;
 913                     dest[0]= R>>22;
 914                     dest[1]= G>>22;
 915                     dest[2]= B>>22;
 916                     dest+= step;
 917                 }
 918             } else {
 919                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 920                     dest[aidx]= 255;
 921                     dest[0]= R>>22;
 922                     dest[1]= G>>22;
 923                     dest[2]= B>>22;
 924                     dest+= step;
 925                 }
 926             }
 927         }
 928         break;
 929     case PIX_FMT_ABGR:
 930         dest++;
 931         aidx= 0;
         /* fall through: aidx becomes -1 below */
 932     case PIX_FMT_BGR24:
 933         aidx--;
         /* fall through: shared B,G,R store code */
 934     case PIX_FMT_BGRA:
 935         if (CONFIG_SMALL) {
 936             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
 937             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
 938                 dest[aidx]= needAlpha ? A : 255;
 939                 dest[0]= B>>22;
 940                 dest[1]= G>>22;
 941                 dest[2]= R>>22;
 942                 dest+= step;
 943             }
 944         } else {
 945             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 946                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
 947                     dest[aidx]= A;
 948                     dest[0]= B>>22;
 949                     dest[1]= G>>22;
 950                     dest[2]= R>>22;
 951                     dest+= step;
 952                 }
 953             } else {
 954                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
 955                     dest[aidx]= 255;
 956                     dest[0]= B>>22;
 957                     dest[1]= G>>22;
 958                     dest[2]= R>>22;
 959                     dest+= step;
 960                 }
 961             }
 962         }
 963         break;
 964     default:
         /* only the six formats above are handled here */
 965         assert(0);
 966     }
 967 }
 968
 /**
  * Fill a rectangular region of an 8-bit plane with a constant value.
  *
  * @param plane  start of the plane
  * @param stride distance in bytes between the starts of consecutive rows
  * @param width  number of bytes set in each row
  * @param height number of rows to fill
  * @param y      index of the first row to fill
  * @param val    byte value written
  */
 static void fillPlane(uint8_t* plane, int stride, int width, int height, int y, uint8_t val)
 {
     uint8_t *row = plane + stride*y;
     int rowsLeft;

     for (rowsLeft = height; rowsLeft > 0; rowsLeft--) {
         memset(row, val, width);
         row += stride;
     }
 }
 978
 979 static void rgb48ToY_c(uint8_t *dst, const uint8_t *src, int width,
 980                        uint32_t *unused)
 981 {
 982     int i;
 983     for (i = 0; i < width; i++) {
 984         int r = src[i*6+0];
 985         int g = src[i*6+2];
 986         int b = src[i*6+4];
 987
 988         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
 989     }
 990 }
 991
 992 static void rgb48ToUV_c(uint8_t *dstU, uint8_t *dstV,
 993                         const uint8_t *src1, const uint8_t *src2,
 994                         int width, uint32_t *unused)
 995 {
 996     int i;
 997     assert(src1==src2);
 998     for (i = 0; i < width; i++) {
 999         int r = src1[6*i + 0];
1000         int g = src1[6*i + 2];
1001         int b = src1[6*i + 4];
1002
1003         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1004         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1005     }
1006 }
1007
1008 static void rgb48ToUV_half_c(uint8_t *dstU, uint8_t *dstV,
1009                              const uint8_t *src1, const uint8_t *src2,
1010                              int width, uint32_t *unused)
1011 {
1012     int i;
1013     assert(src1==src2);
1014     for (i = 0; i < width; i++) {
1015         int r= src1[12*i + 0] + src1[12*i + 6];
1016         int g= src1[12*i + 2] + src1[12*i + 8];
1017         int b= src1[12*i + 4] + src1[12*i + 10];
1018
1019         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1020         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1021     }
1022 }
1023
1024 static void bgr48ToY_c(uint8_t *dst, const uint8_t *src, int width,
1025                        uint32_t *unused)
1026 {
1027     int i;
1028     for (i = 0; i < width; i++) {
1029         int b = src[i*6+0];
1030         int g = src[i*6+2];
1031         int r = src[i*6+4];
1032
1033         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1034     }
1035 }
1036
1037 static void bgr48ToUV_c(uint8_t *dstU, uint8_t *dstV,
1038                         const uint8_t *src1, const uint8_t *src2,
1039                         int width, uint32_t *unused)
1040 {
1041     int i;
1042     for (i = 0; i < width; i++) {
1043         int b = src1[6*i + 0];
1044         int g = src1[6*i + 2];
1045         int r = src1[6*i + 4];
1046
1047         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1048         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1049     }
1050 }
1051
1052 static void bgr48ToUV_half_c(uint8_t *dstU, uint8_t *dstV,
1053                              const uint8_t *src1, const uint8_t *src2,
1054                              int width, uint32_t *unused)
1055 {
1056     int i;
1057     for (i = 0; i < width; i++) {
1058         int b= src1[12*i + 0] + src1[12*i + 6];
1059         int g= src1[12*i + 2] + src1[12*i + 8];
1060         int r= src1[12*i + 4] + src1[12*i + 10];
1061
1062         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1063         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1064     }
1065 }
1066
/*
 * BGR2Y(type, name, ...) generates a static function
 *     name_c(dst, src, width, unused)
 * that converts a line of packed pixels, read in units of `type`, to 8-bit
 * luma.
 *
 * For every pixel each component is isolated as (pixel >> sh*) & mask*.  The
 * component is not moved down to bit 0 afterwards, so the instantiations
 * below pass coefficients that are pre-shifted to bring all three products
 * to a common scale, together with a correspondingly larger final shift S.
 * The term 33 << (S-1) is added before the final shift.
 *
 * Inside the macro RY, GY and BY are the macro parameters, not the
 * file-level constants of the same name.
 */
1067 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1068 static void name ## _c(uint8_t *dst, const uint8_t *src, \
1069                        int width, uint32_t *unused)\
1070 {\
1071     int i;\
1072     for (i=0; i<width; i++) {\
1073         int b= (((const type*)src)[i]>>shb)&maskb;\
1074         int g= (((const type*)src)[i]>>shg)&maskg;\
1075         int r= (((const type*)src)[i]>>shr)&maskr;\
1076 \
1077         dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1078     }\
1079 }
1080
/* One luma converter per supported packed 32-, 16- and 15-bit layout. */
1081 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1082 BGR2Y(uint32_t,bgr321ToY,16,16, 0, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1083 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1084 BGR2Y(uint32_t,rgb321ToY, 0,16,16, 0xFF00, 0x00FF, 0xFF00, RY    , GY<<8, BY    , RGB2YUV_SHIFT+8)
1085 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1086 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1087 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1088 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1089
/**
 * Extract the alpha plane from packed 4-byte pixels whose first byte is
 * alpha.
 *
 * @param dst    receives width alpha bytes
 * @param src    packed input line, 4 bytes per pixel
 * @param width  number of pixels to process
 * @param unused not referenced
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[4*x];
        x++;
    }
}
1097
/*
 * BGR2UV(type, name, ...) generates two static functions that convert a line
 * of packed pixels, read in units of `type`, to 8-bit U and V:
 *
 *   name_c       one chroma sample per input pixel.  Each component is
 *                extracted as ((pixel >> shp) & mask*) >> sh*.
 *   name_half_c  one chroma sample per pair of input pixels.  The two pixels
 *                are summed per component: g is the sum of the bits outside
 *                maskr|maskb, r and b are taken from (pix0 + pix1 - g), and
 *                every mask is widened by one bit (mask | 2*mask) to keep the
 *                carry of the addition.  The final shift is S+1 instead of S
 *                to compensate for the doubled sum.
 *
 * The second source pointer (dummy) and the last argument (unused) are not
 * referenced.  As in BGR2Y, RU..BV are macro parameters inside the macro and
 * the instantiations pass pre-shifted coefficients plus a matching shift S.
 */
1098 #define BGR2UV(type, name, shr, shg, shb, shp, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S) \
1099 static void name ## _c(uint8_t *dstU, uint8_t *dstV, \
1100                        const uint8_t *src, const uint8_t *dummy, \
1101                        int width, uint32_t *unused)\
1102 {\
1103     int i;\
1104     for (i=0; i<width; i++) {\
1105         int b= ((((const type*)src)[i]>>shp)&maskb)>>shb;\
1106         int g= ((((const type*)src)[i]>>shp)&maskg)>>shg;\
1107         int r= ((((const type*)src)[i]>>shp)&maskr)>>shr;\
1108 \
1109         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1110         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1111     }\
1112 }\
1113 static void name ## _half_c(uint8_t *dstU, uint8_t *dstV, \
1114                             const uint8_t *src, const uint8_t *dummy, \
1115                             int width, uint32_t *unused)\
1116 {\
1117     int i;\
1118     for (i=0; i<width; i++) {\
1119         int pix0= ((const type*)src)[2*i+0]>>shp;\
1120         int pix1= ((const type*)src)[2*i+1]>>shp;\
1121         int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
1122         int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1123         int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1124         g&= maskg|(2*maskg);\
1125 \
1126         g>>=shg;\
1127 \
1128         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1129         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1130     }\
1131 }
1132
/* One full- and one half-resolution chroma converter per packed layout. */
1133 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1134 BGR2UV(uint32_t,bgr321ToUV,16, 0, 0, 8, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1135 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1136 BGR2UV(uint32_t,rgb321ToUV, 0, 0,16, 8,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1137 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1138 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1139 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1140 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1141
/**
 * Convert a line of palette indices to 8-bit luma.
 *
 * The luma value is the least significant byte of the palette entry
 * selected by each input byte.
 *
 * @param dst   receives width luma bytes
 * @param src   input line of 8-bit palette indices
 * @param width number of pixels to convert
 * @param pal   palette indexed by the input bytes
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
    }
}
1151
/**
 * Convert a line of palette indices to 8-bit U and V.
 *
 * U is taken from bits 8..15 and V from bits 16..23 of the palette entry
 * selected by each input byte.
 *
 * @param dstU  receives width U bytes
 * @param dstV  receives width V bytes
 * @param src1  input line of 8-bit palette indices
 * @param src2  must equal src1 (asserted)
 * @param width number of pixels to convert
 * @param pal   palette indexed by the input bytes
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x;

    assert(src1 == src2);
    for (x = 0; x < width; x++) {
        const int entry = pal[src1[x]];

        dstU[x] = entry>>8;
        dstV[x] = entry>>16;
    }
}
1165
/**
 * Expand a line of 1 bit-per-pixel data to 8-bit luma, treating a set bit
 * as black: every input byte is inverted, then each bit (most significant
 * bit first) becomes 0 or 255.
 *
 * All width pixels are written.  When width is not a multiple of 8 the
 * remaining width % 8 pixels are taken from the most significant bits of one
 * additional input byte; previously those pixels were left unwritten.
 *
 * @param dst    receives width luma bytes
 * @param src    packed input line, 8 pixels per byte; (width + 7) / 8 bytes
 *               are read
 * @param width  number of pixels to convert
 * @param unused not referenced
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i, j;
    const int fullBytes = width > 0 ? width / 8 : 0;
    const int tailBits  = width > 0 ? width % 8 : 0;

    for (i = 0; i < fullBytes; i++) {
        const int d = ~src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }

    /* trailing pixels of a line whose width is not a multiple of 8 */
    if (tailBits) {
        const int d = ~src[fullBytes];
        for (j = 0; j < tailBits; j++)
            dst[8*fullBytes+j] = ((d>>(7-j))&1)*255;
    }
}
1176
/**
 * Expand a line of 1 bit-per-pixel data to 8-bit luma, treating a set bit
 * as white: each bit (most significant bit first) becomes 0 or 255.
 *
 * All width pixels are written.  When width is not a multiple of 8 the
 * remaining width % 8 pixels are taken from the most significant bits of one
 * additional input byte; previously those pixels were left unwritten.
 *
 * @param dst    receives width luma bytes
 * @param src    packed input line, 8 pixels per byte; (width + 7) / 8 bytes
 *               are read
 * @param width  number of pixels to convert
 * @param unused not referenced
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i, j;
    const int fullBytes = width > 0 ? width / 8 : 0;
    const int tailBits  = width > 0 ? width % 8 : 0;

    for (i = 0; i < fullBytes; i++) {
        const int d = src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }

    /* trailing pixels of a line whose width is not a multiple of 8 */
    if (tailBits) {
        const int d = src[fullBytes];
        for (j = 0; j < tailBits; j++)
            dst[8*fullBytes+j] = ((d>>(7-j))&1)*255;
    }
}
1187
1188 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
1189                        const int16_t *chrUSrc, const int16_t *chrVSrc,
1190                        const int16_t *alpSrc,
1191                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
1192                        uint8_t *aDest, int dstW, int chrDstW)
1193 {
1194     int i;
1195     for (i=0; i<dstW; i++) {
1196         int val= (lumSrc[i]+64)>>7;
1197         dest[i]= av_clip_uint8(val);
1198     }
1199
1200     if (uDest)
1201         for (i=0; i<chrDstW; i++) {
1202             int u=(chrUSrc[i]+64)>>7;
1203             int v=(chrVSrc[i]+64)>>7;
1204             uDest[i]= av_clip_uint8(u);
1205             vDest[i]= av_clip_uint8(v);
1206         }
1207
1208     if (CONFIG_SWSCALE_ALPHA && aDest)
1209         for (i=0; i<dstW; i++) {
1210             int val= (alpSrc[i]+64)>>7;
1211             aDest[i]= av_clip_uint8(val);
1212         }
1213 }
1214
1215 /**
1216  * vertical bilinear scale YV12 to RGB
      *
      * Writes one packed output line to dest from two input lines per plane
      * (buf0/buf1, ubuf0/ubuf1, vbuf0/vbuf1 and abuf0/abuf1).  yalpha1 and
      * uvalpha1 are the complements (4095 - alpha) of the luma and chroma
      * weights passed in.
      *
      * The body is one expansion of YSCALE_YUV_2_ANYRGB_C with the "2"
      * variants of the helper macros; the visible part of that macro switches
      * over the destination format and uses y to select the dither row.
      * NOTE(review): presumably the helper macros blend line 0 and line 1 with
      * yalpha1/yalpha and uvalpha1/uvalpha -- they are defined outside this
      * view, confirm there.
1217  */
1218 static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
1219                           const uint16_t *buf1, const uint16_t *ubuf0,
1220                           const uint16_t *ubuf1, const uint16_t *vbuf0,
1221                           const uint16_t *vbuf1, const uint16_t *abuf0,
1222                           const uint16_t *abuf1, uint8_t *dest, int dstW,
1223                           int yalpha, int uvalpha, int y)
1224 {
1225     int  yalpha1=4095- yalpha;
1226     int uvalpha1=4095-uvalpha;
     /* i is referenced by name inside the expanded macro body */
1227     int i;
1228
1229     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1230 }
1231
1232 /**
1233  * YV12 to RGB without scaling or interpolating
      *
      * Writes one packed output line to dest from a single luma line (buf0).
      * uvalpha selects between two expansions of YSCALE_YUV_2_ANYRGB_C: below
      * 2048 the "1" helper macros are used, otherwise the "1B" ones.
      * NOTE(review): presumably the "1B" variants combine ubuf0/ubuf1 and
      * vbuf0/vbuf1 while the "1" variants read only the first chroma line --
      * the helper macros are defined outside this view, confirm there.
      *
      * Both branches pass YSCALE_YUV_2_MONO2_C for the monochrome case, which
      * is why buf1, yalpha and yalpha1 are defined locally (see the FIXMEs).
1234  */
1235 static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
1236                           const uint16_t *ubuf0, const uint16_t *ubuf1,
1237                           const uint16_t *vbuf0, const uint16_t *vbuf1,
1238                           const uint16_t *abuf0, uint8_t *dest, int dstW,
1239                           int uvalpha, enum PixelFormat dstFormat,
1240                           int flags, int y)
1241 {
1242     const int yalpha1=0;
1243     int i;
1244
1245     const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1246     const int yalpha= 4096; //FIXME ...
1247
1248     if (uvalpha < 2048) {
1249         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1250     } else {
1251         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1252     }
1253 }
1254
1255 //FIXME yuy2* can read up to 7 samples too many
1256
/**
 * Extract luma from a packed line in which luma occupies every even byte
 * (byte 0 of each 2-byte group).
 *
 * @param dst    receives width luma bytes
 * @param src    packed input line, 2 bytes per luma sample
 * @param width  number of luma samples to extract
 * @param unused not referenced
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2*x];
        x++;
    }
}
1264
/**
 * Extract chroma from a packed line organised in 4-byte groups: byte 1 of
 * each group goes to dstU and byte 3 to dstV.
 *
 * @param dstU   receives width U bytes
 * @param dstV   receives width V bytes
 * @param src1   packed input line, 4 bytes per chroma pair
 * @param src2   must equal src1 (asserted after the conversion)
 * @param width  number of chroma pairs to extract
 * @param unused not referenced
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int base = 4*x;

        dstU[x] = src1[base + 1];
        dstV[x] = src1[base + 3];
    }
    assert(src1 == src2);
}
1275
/**
 * Reduce two separate planes of 2-byte samples to 8 bits by keeping byte 1
 * of every sample.
 * NOTE(review): for little-endian 16-bit samples that is the most
 * significant byte, as the function name suggests -- confirm with callers.
 *
 * @param dstU   receives width bytes taken from src1
 * @param dstV   receives width bytes taken from src2
 * @param src1   first input plane, 2 bytes per sample
 * @param src2   second input plane, 2 bytes per sample
 * @param width  number of samples per plane
 * @param unused not referenced
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int hi = 2*x + 1;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
    }
}
1285
/* This is almost identical to yuy2ToY_c and exists only because calling
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
 *
 * Extracts luma from a packed line in which luma occupies every odd byte
 * (byte 1 of each 2-byte group); unused is not referenced. */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2*x + 1];
        x++;
    }
}
1295
/**
 * Extract chroma from a packed line organised in 4-byte groups: byte 0 of
 * each group goes to dstU and byte 2 to dstV.
 *
 * @param dstU   receives width U bytes
 * @param dstV   receives width V bytes
 * @param src1   packed input line, 4 bytes per chroma pair
 * @param src2   must equal src1 (asserted after the conversion)
 * @param width  number of chroma pairs to extract
 * @param unused not referenced
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int base = 4*x;

        dstU[x] = src1[base + 0];
        dstV[x] = src1[base + 2];
    }
    assert(src1 == src2);
}
1306
/**
 * Reduce two separate planes of 2-byte samples to 8 bits by keeping byte 0
 * of every sample.
 * NOTE(review): for big-endian 16-bit samples that is the most significant
 * byte, as the function name suggests -- confirm with callers.
 *
 * @param dstU   receives width bytes taken from src1
 * @param dstV   receives width bytes taken from src2
 * @param src1   first input plane, 2 bytes per sample
 * @param src2   second input plane, 2 bytes per sample
 * @param width  number of samples per plane
 * @param unused not referenced
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int hi = 2*x;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
    }
}
1316
1317 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1318                                         const uint8_t *src, int width)
1319 {
1320     int i;
1321     for (i = 0; i < width; i++) {
1322         dst1[i] = src[2*i+0];
1323         dst2[i] = src[2*i+1];
1324     }
1325 }
1326
/**
 * Split an interleaved chroma line via nvXXtoUV_c(): the first byte of every
 * pair in src1 goes to dstU, the second to dstV.  src2 and unused are not
 * referenced.
 */
1327 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1328                        const uint8_t *src1, const uint8_t *src2,
1329                        int width, uint32_t *unused)
1330 {
1331     nvXXtoUV_c(dstU, dstV, src1, width);
1332 }
1333
/**
 * Split an interleaved chroma line via nvXXtoUV_c() with the destinations
 * swapped: the first byte of every pair in src1 goes to dstV, the second to
 * dstU.  src2 and unused are not referenced.
 */
1334 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1335                        const uint8_t *src1, const uint8_t *src2,
1336                        int width, uint32_t *unused)
1337 {
1338     nvXXtoUV_c(dstV, dstU, src1, width);
1339 }
1340
/*
 * YUV_NBPS(depth, endianness, rfunc) generates two static functions,
 *     <endianness><depth>ToUV_c(dstU, dstV, _srcU, _srcV, width, unused)
 *     <endianness><depth>ToY_c(dstY, _srcY, width, unused)
 * that read samples stored in 16-bit units through rfunc and reduce them to
 * 8 bits by shifting right by (depth - 8).  The low bits are simply dropped,
 * hence the FIXME below.  The last argument (unused) is not referenced.
 */
1341 // FIXME Maybe dither instead.
1342 #define YUV_NBPS(depth, endianness, rfunc) \
1343 static void endianness ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1344                                           const uint8_t *_srcU, const uint8_t *_srcV, \
1345                                           int width, uint32_t *unused) \
1346 { \
1347     int i; \
1348     const uint16_t *srcU = (const uint16_t*)_srcU; \
1349     const uint16_t *srcV = (const uint16_t*)_srcV; \
1350     for (i = 0; i < width; i++) { \
1351         dstU[i] = rfunc(&srcU[i])>>(depth-8); \
1352         dstV[i] = rfunc(&srcV[i])>>(depth-8); \
1353     } \
1354 } \
1355 \
1356 static void endianness ## depth ## ToY_c(uint8_t *dstY, const uint8_t *_srcY, \
1357                                          int width, uint32_t *unused) \
1358 { \
1359     int i; \
1360     const uint16_t *srcY = (const uint16_t*)_srcY; \
1361     for (i = 0; i < width; i++) \
1362         dstY[i] = rfunc(&srcY[i])>>(depth-8); \
1363 } \
1364
/* Readers for 9- and 10-bit samples in little- and big-endian storage. */
1365 YUV_NBPS( 9, LE, AV_RL16)
1366 YUV_NBPS( 9, BE, AV_RB16)
1367 YUV_NBPS(10, LE, AV_RL16)
1368 YUV_NBPS(10, BE, AV_RB16)
1369
1370 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1371                        int width, uint32_t *unused)
1372 {
1373     int i;
1374     for (i=0; i<width; i++) {
1375         int b= src[i*3+0];
1376         int g= src[i*3+1];
1377         int r= src[i*3+2];
1378
1379         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1380     }
1381 }
1382
1383 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1384                         const uint8_t *src2, int width, uint32_t *unused)
1385 {
1386     int i;
1387     for (i=0; i<width; i++) {
1388         int b= src1[3*i + 0];
1389         int g= src1[3*i + 1];
1390         int r= src1[3*i + 2];
1391
1392         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1393         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1394     }
1395     assert(src1 == src2);
1396 }
1397
1398 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1399                              const uint8_t *src2, int width, uint32_t *unused)
1400 {
1401     int i;
1402     for (i=0; i<width; i++) {
1403         int b= src1[6*i + 0] + src1[6*i + 3];
1404         int g= src1[6*i + 1] + src1[6*i + 4];
1405         int r= src1[6*i + 2] + src1[6*i + 5];
1406
1407         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1408         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1409     }
1410     assert(src1 == src2);
1411 }
1412
1413 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1414                        uint32_t *unused)
1415 {
1416     int i;
1417     for (i=0; i<width; i++) {
1418         int r= src[i*3+0];
1419         int g= src[i*3+1];
1420         int b= src[i*3+2];
1421
1422         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1423     }
1424 }
1425
1426 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1427                         const uint8_t *src2, int width, uint32_t *unused)
1428 {
1429     int i;
1430     assert(src1==src2);
1431     for (i=0; i<width; i++) {
1432         int r= src1[3*i + 0];
1433         int g= src1[3*i + 1];
1434         int b= src1[3*i + 2];
1435
1436         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1437         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1438     }
1439 }
1440
1441 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1442                              const uint8_t *src2, int width, uint32_t *unused)
1443 {
1444     int i;
1445     assert(src1==src2);
1446     for (i=0; i<width; i++) {
1447         int r= src1[6*i + 0] + src1[6*i + 3];
1448         int g= src1[6*i + 1] + src1[6*i + 4];
1449         int b= src1[6*i + 2] + src1[6*i + 5];
1450
1451         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1452         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1453     }
1454 }
1455
1456
// bilinear / bicubic scaling
/**
 * Horizontally filter one 8-bit input line into 16-bit intermediate samples.
 *
 * Output sample x is the sum over filterSize taps of
 * src[filterPos[x] + tap] * filter[filterSize*x + tap], shifted right by 7
 * and limited to at most (1<<15)-1.
 *
 * @param dst        receives dstW output samples
 * @param dstW       number of output samples
 * @param src        input line
 * @param srcW       not referenced
 * @param xInc       not referenced
 * @param filter     filterSize coefficients per output sample
 * @param filterPos  first input index used for each output sample
 * @param filterSize number of taps per output sample
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     int srcW, int xInc,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const int firstSrc   = filterPos[x];
        const int firstCoeff = filterSize*x;
        int sum = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            sum += ((int)src[firstSrc + tap])*filter[firstCoeff + tap];

        // the cubic equation does overflow ..., so the upper bound is clamped
        dst[x] = FFMIN(sum>>7, (1<<15)-1);
    }
}
1476
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * Rescale a pair of intermediate chroma lines in place.
 *
 * Each sample is first limited to at most 30775 and then mapped with
 * (x*4663 - 9289992) >> 12.
 * NOTE(review): the function name indicates a limited-range to full-range
 * (JPEG) conversion -- confirm the constants against the range definition.
 *
 * @param dstU  U line, width samples, modified in place
 * @param dstV  V line, width samples, modified in place
 * @param width number of samples per line
 */
static void chrRangeToJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int limited = FFMIN(dstU[x],30775);
        dstU[x] = (limited*4663 - 9289992)>>12; //-264
        limited = FFMIN(dstV[x],30775);
        dstV[x] = (limited*4663 - 9289992)>>12; //-264
    }
}
/**
 * Rescale a pair of intermediate chroma lines in place with
 * (x*1799 + 4081085) >> 11.
 * NOTE(review): the function name indicates a full-range (JPEG) to
 * limited-range conversion -- confirm the constants against the range
 * definition.
 *
 * @param dstU  U line, width samples, modified in place
 * @param dstV  V line, width samples, modified in place
 * @param width number of samples per line
 */
static void chrRangeFromJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        dstU[x] = (dstU[x]*1799 + 4081085)>>11; //1469
        dstV[x] = (dstV[x]*1799 + 4081085)>>11; //1469
        x++;
    }
}
/**
 * Rescale an intermediate luma line in place.
 *
 * Each sample is first limited to at most 30189 and then mapped with
 * (x*19077 - 39057361) >> 14.
 * NOTE(review): the function name indicates a limited-range to full-range
 * (JPEG) conversion -- confirm the constants against the range definition.
 *
 * @param dst   luma line, width samples, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(uint16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int limited = FFMIN(dst[x],30189);

        dst[x] = (limited*19077 - 39057361)>>14;
    }
}
/**
 * Rescale an intermediate luma line in place with
 * (x*14071 + 33561947) >> 14.
 * NOTE(review): the function name indicates a full-range (JPEG) to
 * limited-range conversion -- confirm the constants against the range
 * definition.
 *
 * @param dst   luma line, width samples, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(uint16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        dst[x] = (dst[x]*14071 + 33561947)>>14;
        x++;
    }
}
1507
1508 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1509                            const uint8_t *src, int srcW, int xInc)
1510 {
1511     int i;
1512     unsigned int xpos=0;
1513     for (i=0;i<dstWidth;i++) {
1514         register unsigned int xx=xpos>>16;
1515         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1516         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1517         xpos+=xInc;
1518     }
1519 }
1520
1521 // *** horizontal scale Y line to temp buffer
1522 static inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1523                            const uint8_t *src, int srcW, int xInc,
1524                            const int16_t *hLumFilter,
1525                            const int16_t *hLumFilterPos, int hLumFilterSize,
1526                            uint8_t *formatConvBuffer,
1527                            uint32_t *pal, int isAlpha)
1528 {
1529     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1530     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1531
1532     src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
1533
1534     if (toYV12) {
1535         toYV12(formatConvBuffer, src, srcW, pal);
1536         src= formatConvBuffer;
1537     }
1538
1539     if (!c->hyscale_fast) {
1540         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1541     } else { // fast bilinear upscale / crap downscale
1542         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1543     }
1544
1545     if (convertRange)
1546         convertRange(dst, dstWidth);
1547 }
1548
1549 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1550                            int dstWidth, const uint8_t *src1,
1551                            const uint8_t *src2, int srcW, int xInc)
1552 {
1553     int i;
1554     unsigned int xpos=0;
1555     for (i=0;i<dstWidth;i++) {
1556         register unsigned int xx=xpos>>16;
1557         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1558         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1559         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1560         xpos+=xInc;
1561     }
1562 }
1563
1564 static inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1565                            const uint8_t *src1, const uint8_t *src2,
1566                            int srcW, int xInc, const int16_t *hChrFilter,
1567                            const int16_t *hChrFilterPos, int hChrFilterSize,
1568                            uint8_t *formatConvBuffer, uint32_t *pal)
1569 {
1570
1571     src1 += c->chrSrcOffset;
1572     src2 += c->chrSrcOffset;
1573
1574     if (c->chrToYV12) {
1575         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1576         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1577         src1= formatConvBuffer;
1578         src2= buf2;
1579     }
1580
1581     if (!c->hcscale_fast) {
1582         c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1583         c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1584     } else { // fast bilinear upscale / crap downscale
1585         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1586     }
1587
1588     if (c->chrConvertRange)
1589         c->chrConvertRange(dst1, dst2, dstWidth);
1590 }
1591
/*
 * Buffer tracing for swScale().  Set DEBUG_SWSCALE_BUFFERS to a non-zero
 * value to make DEBUG_BUFFERS(...) log its arguments through av_log() at
 * AV_LOG_DEBUG level; with the default of 0 the call sits behind a
 * constant-false condition.
 *
 * DEBUG_BUFFERS requires a variable named c in scope at the point of use.
 * It expands to an unbraced if statement, so it must not be placed directly
 * before an else.
 */
1592 #define DEBUG_SWSCALE_BUFFERS 0
1593 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
1594
/**
 * Scale one horizontal slice of the source picture into the destination.
 *
 * Source lines [srcSliceY, srcSliceY + srcSliceH) are horizontally scaled
 * into the context's line buffers (lumPixBuf, chrUPixBuf, chrVPixBuf,
 * alpPixBuf). A destination line is emitted by the vertical filter / output
 * function once every source line it needs has been buffered. Progress
 * (dstY, buffer indices, last buffered source lines) is stored back in the
 * context so that the next slice continues where this one stopped.
 *
 * Side effects on the caller's arrays: for packed source formats src[1..3]
 * and srcStride[1..3] are overwritten with the values of plane 0, and
 * srcStride[1] / srcStride[2] are shifted left by c->vChrDrop.
 *
 * @param c          scaler context (read and updated)
 * @param src        source plane pointers, pointing at the first line of the slice
 * @param srcStride  source plane strides (modified, see above)
 * @param srcSliceY  index of the first source line contained in the slice
 * @param srcSliceH  number of source lines in the slice
 * @param dst        destination plane pointers
 * @param dstStride  destination plane strides
 * @return number of destination lines written during this call
 */
1595 static int swScale(SwsContext *c, const uint8_t* src[],
1596                    int srcStride[], int srcSliceY,
1597                    int srcSliceH, uint8_t* dst[], int dstStride[])
1598 {
1599     /* load a few things into local vars to make the code more readable? and faster */
1600     const int srcW= c->srcW;
1601     const int dstW= c->dstW;
1602     const int dstH= c->dstH;
1603     const int chrDstW= c->chrDstW;
1604     const int chrSrcW= c->chrSrcW;
1605     const int lumXInc= c->lumXInc;
1606     const int chrXInc= c->chrXInc;
1607     const enum PixelFormat dstFormat= c->dstFormat;
1608     const int flags= c->flags;
1609     int16_t *vLumFilterPos= c->vLumFilterPos;
1610     int16_t *vChrFilterPos= c->vChrFilterPos;
1611     int16_t *hLumFilterPos= c->hLumFilterPos;
1612     int16_t *hChrFilterPos= c->hChrFilterPos;
1613     int16_t *vLumFilter= c->vLumFilter;
1614     int16_t *vChrFilter= c->vChrFilter;
1615     int16_t *hLumFilter= c->hLumFilter;
1616     int16_t *hChrFilter= c->hChrFilter;
1617     int32_t *lumMmxFilter= c->lumMmxFilter;
1618     int32_t *chrMmxFilter= c->chrMmxFilter;
1619     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
1620     const int vLumFilterSize= c->vLumFilterSize;
1621     const int vChrFilterSize= c->vChrFilterSize;
1622     const int hLumFilterSize= c->hLumFilterSize;
1623     const int hChrFilterSize= c->hChrFilterSize;
1624     int16_t **lumPixBuf= c->lumPixBuf;
1625     int16_t **chrUPixBuf= c->chrUPixBuf;
1626     int16_t **chrVPixBuf= c->chrVPixBuf;
1627     int16_t **alpPixBuf= c->alpPixBuf;
1628     const int vLumBufSize= c->vLumBufSize;
1629     const int vChrBufSize= c->vChrBufSize;
1630     uint8_t *formatConvBuffer= c->formatConvBuffer;
         /* Slice position and height in chroma lines. The negate/shift/negate
            form rounds the height up instead of down (this relies on >> of a
            negative int being an arithmetic shift). */
1631     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
1632     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
1633     int lastDstY;
1634     uint32_t *pal=c->pal_yuv;
1635
1636     /* vars which will change and which we need to store back in the context */
1637     int dstY= c->dstY;
1638     int lumBufIndex= c->lumBufIndex;
1639     int chrBufIndex= c->chrBufIndex;
1640     int lastInLumBuf= c->lastInLumBuf;
1641     int lastInChrBuf= c->lastInChrBuf;
1642
         /* Packed source formats: make every plane pointer and stride refer
            to plane 0 (this writes into the caller's arrays). */
1643     if (isPacked(c->srcFormat)) {
1644         src[0]=
1645         src[1]=
1646         src[2]=
1647         src[3]= src[0];
1648         srcStride[0]=
1649         srcStride[1]=
1650         srcStride[2]=
1651         srcStride[3]= srcStride[0];
1652     }
         /* Enlarge the chroma strides by 2^vChrDrop (also in the caller's array). */
1653     srcStride[1]<<= c->vChrDrop;
1654     srcStride[2]<<= c->vChrDrop;
1655
1656     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
1657                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
1658                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
1659     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
1660                    srcSliceY,    srcSliceH,    dstY,    dstH);
1661     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
1662                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
1663
         /* Warn (at most once per process, and only with SWS_PRINT_INFO) when
            any destination stride is not a multiple of 8. */
1664     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
1665         static int warnedAlready=0; //FIXME move this into the context perhaps
1666         if (flags & SWS_PRINT_INFO && !warnedAlready) {
1667             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
1668                    "         ->cannot do aligned memory accesses anymore\n");
1669             warnedAlready=1;
1670         }
1671     }
1672
1673     /* Note the user might start scaling the picture in the middle so this
1674        will not get executed. This is not really intended but works
1675        currently, so people might do it. */
1676     if (srcSliceY ==0) {
1677         lumBufIndex=-1;
1678         chrBufIndex=-1;
1679         dstY=0;
1680         lastInLumBuf= -1;
1681         lastInChrBuf= -1;
1682     }
1683
         /* Remember where this call started; used for the return value and
            for the alpha fill after the loop. */
1684     lastDstY= dstY;
1685
         /* One iteration per destination line. The loop is left early (break)
            as soon as the current slice cannot supply the lines needed. */
1686     for (;dstY < dstH; dstY++) {
1687         unsigned char *dest =dst[0]+dstStride[0]*dstY;
1688         const int chrDstY= dstY>>c->chrDstVSubSample;
1689         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
1690         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
1691         unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
1692
1693         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
             /* Same lookup for the last destination line that shares this
                chroma row (dstY with the subsampling bits set), clamped to dstH-1. */
1694         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
1695         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
1696         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
1697         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
1698         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
1699         int enough_lines;
1700
1701         //handle holes (FAST_BILINEAR & weird filters)
1702         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
1703         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
1704         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
1705         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
1706
1707         DEBUG_BUFFERS("dstY: %d\n", dstY);
1708         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
1709                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
1710         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
1711                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
1712
1713         // Do we have enough lines in this slice to output the dstY line
1714         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
1715
             /* Not enough input: still buffer every remaining line of the
                slice so that the next call can pick up from there. */
1716         if (!enough_lines) {
1717             lastLumSrcY = srcSliceY + srcSliceH - 1;
1718             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
1719             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
1720                                             lastLumSrcY, lastChrSrcY);
1721         }
1722
1723         //Do horizontal scaling
1724         while(lastInLumBuf < lastLumSrcY) {
                 /* src1: next luma line of the slice; src2: the same line of
                    plane 3, which is fed to the alpha buffer below. */
1725             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
1726             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
1727             lumBufIndex++;
1728             assert(lumBufIndex < 2*vLumBufSize);
1729             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
1730             assert(lastInLumBuf + 1 - srcSliceY >= 0);
1731             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
1732                     hLumFilter, hLumFilterPos, hLumFilterSize,
1733                     formatConvBuffer,
1734                     pal, 0);
1735             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
1736                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
1737                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
1738                         formatConvBuffer,
1739                         pal, 1);
1740             lastInLumBuf++;
1741             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
1742                                lumBufIndex,    lastInLumBuf);
1743         }
1744         while(lastInChrBuf < lastChrSrcY) {
1745             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
1746             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
1747             chrBufIndex++;
1748             assert(chrBufIndex < 2*vChrBufSize);
1749             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
1750             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
1751             //FIXME replace parameters through context struct (some at least)
1752
                 /* The buffer index and line counter advance even when the
                    chroma scaling itself is skipped. */
1753             if (c->needs_hcscale)
1754                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
1755                           chrDstW, src1, src2, chrSrcW, chrXInc,
1756                           hChrFilter, hChrFilterPos, hChrFilterSize,
1757                           formatConvBuffer, pal);
1758             lastInChrBuf++;
1759             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
1760                                chrBufIndex,    lastInChrBuf);
1761         }
1762         //wrap buf index around to stay inside the ring buffer
1763         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
1764         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
1765         if (!enough_lines)
1766             break; //we can't output a dstY line so let's try with the next slice
1767
1768 #if HAVE_MMX
1769         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
1770 #endif
             /* All lines except the last two are written through the function
                pointers stored in the context. */
1771         if (dstY < dstH-2) {
                 /* Pointer to the buffer entry holding the first source line
                    of the vertical filter window: step back from the newest
                    entry (lumBufIndex / chrBufIndex) and add the buffer size as
                    a bias. NOTE(review): presumably the pointer arrays hold
                    2*v*BufSize entries, as the asserts further down suggest --
                    confirm against the allocation code. */
1772             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1773             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1774             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1775             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1776             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1777                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1778                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
1779                 c->yuv2nv12X(c,
1780                              vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1781                              vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1782                              dest, uDest, dstW, chrDstW, dstFormat);
1783             } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
1784                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1785                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1786                 if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
1787                     yuv2yuvX16_c(vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1788                                  vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
1789                                  chrVSrcPtr, vChrFilterSize,
1790                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest,
1791                                  (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
1792                                  dstFormat);
1793                 } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
1794                     const int16_t *lumBuf = lumSrcPtr[0];
1795                     const int16_t *chrUBuf= chrUSrcPtr[0];
1796                     const int16_t *chrVBuf= chrVSrcPtr[0];
1797                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
1798                     c->yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
1799                                 uDest, vDest, aDest, dstW, chrDstW);
1800                 } else { //General YV12
1801                     c->yuv2yuvX(c,
1802                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1803                                 vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
1804                                 chrVSrcPtr, vChrFilterSize,
1805                                 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
1806                 }
1807             } else {
                     /* Remaining (packed) destination formats: pick the output
                        function by vertical filter sizes. */
1808                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
1809                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1810                 if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
1811                     int chrAlpha= vChrFilter[2*dstY+1];
1812                     if(flags & SWS_FULL_CHR_H_INT) {
1813                         yuv2rgbX_c_full(c, //FIXME write a packed1_full function
1814                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1815                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr,
1816                                         chrVSrcPtr, vChrFilterSize,
1817                                         alpSrcPtr, dest, dstW, dstY);
1818                     } else {
1819                         c->yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
1820                                        *chrVSrcPtr, *(chrVSrcPtr+1),
1821                                        alpPixBuf ? *alpSrcPtr : NULL,
1822                                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
1823                     }
1824                 } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
1825                     int lumAlpha= vLumFilter[2*dstY+1];
1826                     int chrAlpha= vChrFilter[2*dstY+1];
1827                     lumMmxFilter[2]=
1828                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
1829                     chrMmxFilter[2]=
1830                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
1831                     if(flags & SWS_FULL_CHR_H_INT) {
1832                         yuv2rgbX_c_full(c, //FIXME write a packed2_full function
1833                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1834                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1835                                         alpSrcPtr, dest, dstW, dstY);
1836                     } else {
1837                         c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
1838                                        *chrVSrcPtr, *(chrVSrcPtr+1),
1839                                        alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
1840                                        dest, dstW, lumAlpha, chrAlpha, dstY);
1841                     }
1842                 } else { //general RGB
1843                     if(flags & SWS_FULL_CHR_H_INT) {
1844                         yuv2rgbX_c_full(c,
1845                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1846                                         vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1847                                         alpSrcPtr, dest, dstW, dstY);
1848                     } else {
1849                         c->yuv2packedX(c,
1850                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1851                                        vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1852                                        alpSrcPtr, dest, dstW, dstY);
1853                     }
1854                 }
1855             }
1856         } else { // hmm looks like we can't use MMX here without overwriting this array's tail
                 /* Last two destination lines: same dispatch as above, but the
                    *_c functions are called directly instead of going through
                    the context's function pointers. */
1857             const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
1858             const int16_t **chrUSrcPtr= (const int16_t **)chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1859             const int16_t **chrVSrcPtr= (const int16_t **)chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
1860             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
1861             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1862                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1863                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
1864                 yuv2nv12X_c(c, vLumFilter+dstY*vLumFilterSize,
1865                             lumSrcPtr, vLumFilterSize,
1866                             vChrFilter+chrDstY*vChrFilterSize,
1867                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1868                             dest, uDest, dstW, chrDstW, dstFormat);
1869             } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
1870                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
1871                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
1872                 if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
1873                     yuv2yuvX16_c(vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
1874                                  vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1875                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
1876                                  dstFormat);
1877                 } else {
1878                     yuv2yuvX_c(c, vLumFilter+dstY*vLumFilterSize,
1879                                lumSrcPtr, vLumFilterSize,
1880                                vChrFilter+chrDstY*vChrFilterSize,
1881                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1882                                alpSrcPtr, dest, uDest, vDest, aDest,
1883                                dstW, chrDstW);
1884                 }
1885             } else {
1886                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
1887                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
1888                 if(flags & SWS_FULL_CHR_H_INT) {
1889                     yuv2rgbX_c_full(c,
1890                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1891                                     vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1892                                     alpSrcPtr, dest, dstW, dstY);
1893                 } else {
1894                     yuv2packedX_c(c,
1895                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
1896                                   vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
1897                                   alpSrcPtr, dest, dstW, dstY);
1898                 }
1899             }
1900         }
1901     }
1902
         /* YUVA420P output without alpha line buffers: fillPlane() is called
            on the alpha plane with value 255 for the dstY-lastDstY lines
            starting at lastDstY, i.e. the lines produced by this call. */
1903     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
1904         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
1905
         /* Issue a store fence when the CPU reports MMX2, then emms_c(). */
1906 #if HAVE_MMX2
1907     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
1908         __asm__ volatile("sfence":::"memory");
1909 #endif
1910     emms_c();
1911
1912     /* store changed local vars back in the context */
1913     c->dstY= dstY;
1914     c->lumBufIndex= lumBufIndex;
1915     c->chrBufIndex= chrBufIndex;
1916     c->lastInLumBuf= lastInLumBuf;
1917     c->lastInChrBuf= lastInChrBuf;
1918
         /* Number of destination lines emitted by this call. */
1919     return dstY - lastDstY;
1920 }
1921
1922 static void sws_init_swScale_c(SwsContext *c)
1923 {
1924     enum PixelFormat srcFormat = c->srcFormat;
1925
1926     c->yuv2nv12X    = yuv2nv12X_c;
1927     c->yuv2yuv1     = yuv2yuv1_c;
1928     c->yuv2yuvX     = yuv2yuvX_c;
1929     c->yuv2packed1  = yuv2packed1_c;
1930     c->yuv2packed2  = yuv2packed2_c;
1931     c->yuv2packedX  = yuv2packedX_c;
1932
1933     c->hScale       = hScale_c;
1934
1935     if (c->flags & SWS_FAST_BILINEAR)
1936     {
1937         c->hyscale_fast = hyscale_fast_c;
1938         c->hcscale_fast = hcscale_fast_c;
1939     }
1940
1941     c->chrToYV12 = NULL;
1942     switch(srcFormat) {
1943         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
1944         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
1945         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
1946         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
1947         case PIX_FMT_RGB8     :
1948         case PIX_FMT_BGR8     :
1949         case PIX_FMT_PAL8     :
1950         case PIX_FMT_BGR4_BYTE:
1951         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
1952         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
1953         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
1954         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
1955         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
1956         case PIX_FMT_YUV420P16BE:
1957         case PIX_FMT_YUV422P16BE:
1958         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
1959         case PIX_FMT_YUV420P16LE:
1960         case PIX_FMT_YUV422P16LE:
1961         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
1962     }
1963     if (c->chrSrcHSubSample) {
1964         switch(srcFormat) {
1965         case PIX_FMT_RGB48BE:
1966         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half_c; break;
1967         case PIX_FMT_BGR48BE:
1968         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half_c; break;
1969         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half_c;  break;
1970         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half_c; break;
1971         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_half_c; break;
1972         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half_c; break;
1973         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half_c; break;
1974         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half_c;  break;
1975         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half_c; break;
1976         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_half_c; break;
1977         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half_c; break;
1978         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half_c; break;
1979         }
1980     } else {
1981         switch(srcFormat) {
1982         case PIX_FMT_RGB48BE:
1983         case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_c; break;
1984         case PIX_FMT_BGR48BE:
1985         case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_c; break;
1986         case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_c;  break;
1987         case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_c; break;
1988         case PIX_FMT_BGR24  : c->chrToYV12 = bgr24ToUV_c; break;
1989         case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_c; break;
1990         case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_c; break;
1991         case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_c;  break;
1992         case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_c; break;
1993         case PIX_FMT_RGB24  : c->chrToYV12 = rgb24ToUV_c; break;
1994         case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_c; break;
1995         case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_c; break;
1996         }
1997     }
1998
1999     c->lumToYV12 = NULL;
2000     c->alpToYV12 = NULL;
2001     switch (srcFormat) {
2002     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
2003     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
2004     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2005     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2006     case PIX_FMT_YUYV422  :
2007     case PIX_FMT_YUV420P16BE:
2008     case PIX_FMT_YUV422P16BE:
2009     case PIX_FMT_YUV444P16BE:
2010     case PIX_FMT_Y400A    :
2011     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
2012     case PIX_FMT_UYVY422  :
2013     case PIX_FMT_YUV420P16LE:
2014     case PIX_FMT_YUV422P16LE:
2015     case PIX_FMT_YUV444P16LE:
2016     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c; break;
2017     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c; break;
2018     case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY_c; break;
2019     case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY_c; break;
2020     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c; break;
2021     case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY_c; break;
2022     case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY_c; break;
2023     case PIX_FMT_RGB8     :
2024     case PIX_FMT_BGR8     :
2025     case PIX_FMT_PAL8     :
2026     case PIX_FMT_BGR4_BYTE:
2027     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2028     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2029     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2030     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2031     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2032     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2033     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2034     case PIX_FMT_RGB48BE:
2035     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY_c; break;
2036     case PIX_FMT_BGR48BE:
2037     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY_c; break;
2038     }
2039     if (c->alpPixBuf) {
2040         switch (srcFormat) {
2041         case PIX_FMT_RGB32  :
2042         case PIX_FMT_RGB32_1:
2043         case PIX_FMT_BGR32  :
2044         case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA_c; break;
2045         case PIX_FMT_Y400A  : c->alpToYV12 = yuy2ToY_c; break;
2046         }
2047     }
2048
2049     switch (srcFormat) {
2050     case PIX_FMT_Y400A  :
2051         c->alpSrcOffset = 1;
2052         break;
2053     case PIX_FMT_RGB32  :
2054     case PIX_FMT_BGR32  :
2055         c->alpSrcOffset = 3;
2056         break;
2057     case PIX_FMT_RGB48LE:
2058     case PIX_FMT_BGR48LE:
2059         c->lumSrcOffset = 1;
2060         c->chrSrcOffset = 1;
2061         c->alpSrcOffset = 1;
2062         break;
2063     }
2064
2065     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2066         if (c->srcRange) {
2067             c->lumConvertRange = lumRangeFromJpeg_c;
2068             c->chrConvertRange = chrRangeFromJpeg_c;
2069         } else {
2070             c->lumConvertRange = lumRangeToJpeg_c;
2071             c->chrConvertRange = chrRangeToJpeg_c;
2072         }
2073     }
2074
2075     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2076           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2077         c->needs_hcscale = 1;
2078 }
2079
2080 SwsFunc ff_getSwsFunc(SwsContext *c)
2081 {
2082     sws_init_swScale_c(c);
2083
2084     if (HAVE_MMX)
2085         ff_sws_init_swScale_mmx(c);
2086     if (HAVE_ALTIVEC)
2087         ff_sws_init_swScale_altivec(c);
2088
2089     return swScale;
2090 }