git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
/* Defined with no value; nothing in the code reviewed here tests it.
 * NOTE(review): presumably consumed by other parts of the scaler -- confirm. */
#define DITHER1XBPP

/* Fixed-point RGB -> YUV weights: each real coefficient is multiplied by
 * (1 << RGB2YUV_SHIFT) and rounded by adding 0.5 before the int cast.
 * The *Y weights include a 219/255 factor, the *U and *V weights a 224/255
 * factor.
 * NOTE(review): the names appear to read <source channel><target channel>
 * (BY = weight of B in Y, matching the 0.114 found in the ITU601 rows of
 * rgb2yuv_table) -- confirm against the users of these macros. */
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
/* Eight rows of nine double coefficients; the trailing comment on each row
 * names the colorimetry it describes.  The table is not referenced in the
 * part of the file reviewed here.
 * NOTE(review): the first three columns look like the G, B, R luma weights
 * (0.587, 0.114, 0.299 for ITU601); the meaning of the remaining six columns
 * and of the row index should be confirmed against the code that reads it. */
static const double rgb2yuv_table[8][9]={
    {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
    {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
    {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
};
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
/* 2-row x 8-column dither table with entries in 0..3; each row repeats a
 * two-value pattern four times.  Not referenced in the part of the file
 * reviewed here.
 * NOTE(review): DECLARE_ALIGNED is a project macro -- presumably it requests
 * 8-byte alignment for the array; confirm. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
{  1,   3,   1,   3,   1,   3,   1,   3, },
{  2,   0,   2,   0,   2,   0,   2,   0, },
};
 113
/* 2-row x 8-column dither table with entries in 0..6; each row repeats a
 * two-value pattern four times.  Not referenced in the part of the file
 * reviewed here. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
{  6,   2,   6,   2,   6,   2,   6,   2, },
{  0,   4,   0,   4,   0,   4,   0,   4, },
};
 118
/* 4-row x 8-column dither table with entries in 0..15; columns 4..7 repeat
 * columns 0..3.  Declared without `static`, so it has external linkage.
 * Not referenced in the part of the file reviewed here. */
DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
{  8,   4,  11,   7,   8,   4,  11,   7, },
{  2,  14,   1,  13,   2,  14,   1,  13, },
{ 10,   6,   9,   5,  10,   6,   9,   5, },
{  0,  12,   3,  15,   0,  12,   3,  15, },
};
 125
/* 8x8 dither table with entries in 0..31.  Declared without `static`, so it
 * has external linkage.  Not referenced in the part of the file reviewed
 * here. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
{ 17,   9,  23,  15,  16,   8,  22,  14, },
{  5,  29,   3,  27,   4,  28,   2,  26, },
{ 21,  13,  19,  11,  20,  12,  18,  10, },
{  0,  24,   6,  30,   1,  25,   7,  31, },
{ 16,   8,  22,  14,  17,   9,  23,  15, },
{  4,  28,   2,  26,   5,  29,   3,  27, },
{ 20,  12,  18,  10,  21,  13,  19,  11, },
{  1,  25,   7,  31,   0,  24,   6,  30, },
};
 136
/* 8x8 dither table with entries in 0..72.  Declared without `static`, so it
 * has external linkage.  Not referenced in the part of the file reviewed
 * here. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
{  0,  55,  14,  68,   3,  58,  17,  72, },
{ 37,  18,  50,  32,  40,  22,  54,  35, },
{  9,  64,   5,  59,  13,  67,   8,  63, },
{ 46,  27,  41,  23,  49,  31,  44,  26, },
{  2,  57,  16,  71,   1,  56,  15,  70, },
{ 39,  21,  52,  34,  38,  19,  51,  33, },
{ 11,  66,   7,  62,  10,  65,   6,  60, },
{ 48,  30,  43,  25,  47,  29,  42,  24, },
};
 147
/* 8x8 dither table used by the yuv2mono_*_c_template functions: the row is
 * selected with (y & 7), the column with the pixel position, and the entry
 * is added to the luma value before the table lookup.
 *
 * Four alternative tables are provided.  Because the first condition is
 * `#if 1`, only the first one (entries in 0..217) is ever compiled; the
 * three gamma-correcting variants below it are kept as disabled
 * alternatives. */
#if 1
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{117,  62, 158, 103, 113,  58, 155, 100, },
{ 34, 199,  21, 186,  31, 196,  17, 182, },
{144,  89, 131,  76, 141,  86, 127,  72, },
{  0, 165,  41, 206,  10, 175,  52, 217, },
{110,  55, 151,  96, 120,  65, 162, 107, },
{ 28, 193,  14, 179,  38, 203,  24, 189, },
{138,  83, 124,  69, 148,  93, 134,  79, },
{  7, 172,  48, 213,   3, 168,  45, 210, },
};
#elif 1
// tries to correct a gamma of 1.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 143,  18, 200,   2, 156,  25, 215, },
{ 78,  28, 125,  64,  89,  36, 138,  74, },
{ 10, 180,   3, 161,  16, 195,   8, 175, },
{109,  51,  93,  38, 121,  60, 105,  47, },
{  1, 152,  23, 210,   0, 147,  20, 205, },
{ 85,  33, 134,  71,  81,  30, 130,  67, },
{ 14, 190,   6, 171,  12, 185,   5, 166, },
{117,  57, 101,  44, 113,  54,  97,  41, },
};
#elif 1
// tries to correct a gamma of 2.0
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 124,   8, 193,   0, 140,  12, 213, },
{ 55,  14, 104,  42,  66,  19, 119,  52, },
{  3, 168,   1, 145,   6, 187,   3, 162, },
{ 86,  31,  70,  21,  99,  39,  82,  28, },
{  0, 134,  11, 206,   0, 129,   9, 200, },
{ 62,  17, 114,  48,  58,  16, 109,  45, },
{  5, 181,   2, 157,   4, 175,   1, 151, },
{ 95,  36,  78,  26,  90,  34,  74,  24, },
};
#else
// tries to correct a gamma of 2.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 107,   3, 187,   0, 125,   6, 212, },
{ 39,   7,  86,  28,  49,  11, 102,  36, },
{  1, 158,   0, 131,   3, 180,   1, 151, },
{ 68,  19,  52,  12,  81,  25,  64,  17, },
{  0, 119,   5, 203,   0, 113,   4, 195, },
{ 45,   9,  96,  33,  42,   8,  91,  30, },
{  2, 172,   1, 144,   2, 165,   0, 137, },
{ 77,  23,  60,  15,  72,  21,  56,  14, },
};
#endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest[4], int dstW, int chrDstW,
 203                       int big_endian, int output_bits)
 204 {
 205     //FIXME Optimize (just quickly written not optimized..)
 206     int i;
 207     int shift = 11 + 16 - output_bits;
 208     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 209              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 210
 211 #define output_pixel(pos, val) \
 212     if (big_endian) { \
 213         if (output_bits == 16) { \
 214             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 215         } else { \
 216             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 217         } \
 218     } else { \
 219         if (output_bits == 16) { \
 220             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 221         } else { \
 222             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 223         } \
 224     }
 225     for (i = 0; i < dstW; i++) {
 226         int val = 1 << (26-output_bits);
 227         int j;
 228
 229         for (j = 0; j < lumFilterSize; j++)
 230             val += lumSrc[j][i] * lumFilter[j];
 231
 232         output_pixel(&yDest[i], val);
 233     }
 234
 235     if (uDest) {
 236         for (i = 0; i < chrDstW; i++) {
 237             int u = 1 << (26-output_bits);
 238             int v = 1 << (26-output_bits);
 239             int j;
 240
 241             for (j = 0; j < chrFilterSize; j++) {
 242                 u += chrUSrc[j][i] * chrFilter[j];
 243                 v += chrVSrc[j][i] * chrFilter[j];
 244             }
 245
 246             output_pixel(&uDest[i], u);
 247             output_pixel(&vDest[i], v);
 248         }
 249     }
 250
 251     if (CONFIG_SWSCALE_ALPHA && aDest) {
 252         for (i = 0; i < dstW; i++) {
 253             int val = 1 << (26-output_bits);
 254             int j;
 255
 256             for (j = 0; j < lumFilterSize; j++)
 257                 val += alpSrc[j][i] * lumFilter[j];
 258
 259             output_pixel(&aDest[i], val);
 260         }
 261     }
 262 #undef output_pixel
 263 }
 264
/* Generates a static function named yuv2yuvX<bits><BE_LE>_c that forwards
 * its arguments to yuv2yuvX16_c_template(), passing is_be as big_endian and
 * bits as output_bits.  The uint8_t *_dest[4] argument is cast to
 * uint16_t ** for the template; the SwsContext argument is not used.
 * The six instantiations below cover 9, 10 and 16 bits in both byte orders. */
#define yuv2NBPS(bits, BE_LE, is_be) \
static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
                              const int16_t **lumSrc, int lumFilterSize, \
                              const int16_t *chrFilter, const int16_t **chrUSrc, \
                              const int16_t **chrVSrc, \
                              int chrFilterSize, const int16_t **alpSrc, \
                              uint8_t *_dest[4], int dstW, int chrDstW) \
{ \
    yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                          alpSrc, (uint16_t **) _dest, \
                          dstW, chrDstW, is_be, bits); \
}
yuv2NBPS( 9, BE, 1);
yuv2NBPS( 9, LE, 0);
yuv2NBPS(10, BE, 1);
yuv2NBPS(10, LE, 0);
yuv2NBPS(16, BE, 1);
yuv2NBPS(16, LE, 0);
 284
 285 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 286                        const int16_t **lumSrc, int lumFilterSize,
 287                        const int16_t *chrFilter, const int16_t **chrUSrc,
 288                        const int16_t **chrVSrc,
 289                        int chrFilterSize, const int16_t **alpSrc,
 290                        uint8_t *dest[4], int dstW, int chrDstW)
 291 {
 292     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 293             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 294     int i;
 295
 296     //FIXME Optimize (just quickly written not optimized..)
 297     for (i=0; i<dstW; i++) {
 298         int val=1<<18;
 299         int j;
 300         for (j=0; j<lumFilterSize; j++)
 301             val += lumSrc[j][i] * lumFilter[j];
 302
 303         yDest[i]= av_clip_uint8(val>>19);
 304     }
 305
 306     if (uDest)
 307         for (i=0; i<chrDstW; i++) {
 308             int u=1<<18;
 309             int v=1<<18;
 310             int j;
 311             for (j=0; j<chrFilterSize; j++) {
 312                 u += chrUSrc[j][i] * chrFilter[j];
 313                 v += chrVSrc[j][i] * chrFilter[j];
 314             }
 315
 316             uDest[i]= av_clip_uint8(u>>19);
 317             vDest[i]= av_clip_uint8(v>>19);
 318         }
 319
 320     if (CONFIG_SWSCALE_ALPHA && aDest)
 321         for (i=0; i<dstW; i++) {
 322             int val=1<<18;
 323             int j;
 324             for (j=0; j<lumFilterSize; j++)
 325                 val += alpSrc[j][i] * lumFilter[j];
 326
 327             aDest[i]= av_clip_uint8(val>>19);
 328         }
 329 }
 330
 331 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 332                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 333                        const int16_t *alpSrc,
 334                        uint8_t *dest[4], int dstW, int chrDstW)
 335 {
 336     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 337             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 338     int i;
 339
 340     for (i=0; i<dstW; i++) {
 341         int val= (lumSrc[i]+64)>>7;
 342         yDest[i]= av_clip_uint8(val);
 343     }
 344
 345     if (uDest)
 346         for (i=0; i<chrDstW; i++) {
 347             int u=(chrUSrc[i]+64)>>7;
 348             int v=(chrVSrc[i]+64)>>7;
 349             uDest[i]= av_clip_uint8(u);
 350             vDest[i]= av_clip_uint8(v);
 351         }
 352
 353     if (CONFIG_SWSCALE_ALPHA && aDest)
 354         for (i=0; i<dstW; i++) {
 355             int val= (alpSrc[i]+64)>>7;
 356             aDest[i]= av_clip_uint8(val);
 357         }
 358 }
 359
 360 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 361                         const int16_t **lumSrc, int lumFilterSize,
 362                         const int16_t *chrFilter, const int16_t **chrUSrc,
 363                         const int16_t **chrVSrc, int chrFilterSize,
 364                         const int16_t **alpSrc, uint8_t *dest[4],
 365                         int dstW, int chrDstW)
 366 {
 367     uint8_t *yDest = dest[0], *uDest = dest[1];
 368     enum PixelFormat dstFormat = c->dstFormat;
 369
 370     //FIXME Optimize (just quickly written not optimized..)
 371     int i;
 372     for (i=0; i<dstW; i++) {
 373         int val=1<<18;
 374         int j;
 375         for (j=0; j<lumFilterSize; j++)
 376             val += lumSrc[j][i] * lumFilter[j];
 377
 378         yDest[i]= av_clip_uint8(val>>19);
 379     }
 380
 381     if (!uDest)
 382         return;
 383
 384     if (dstFormat == PIX_FMT_NV12)
 385         for (i=0; i<chrDstW; i++) {
 386             int u=1<<18;
 387             int v=1<<18;
 388             int j;
 389             for (j=0; j<chrFilterSize; j++) {
 390                 u += chrUSrc[j][i] * chrFilter[j];
 391                 v += chrVSrc[j][i] * chrFilter[j];
 392             }
 393
 394             uDest[2*i]= av_clip_uint8(u>>19);
 395             uDest[2*i+1]= av_clip_uint8(v>>19);
 396         }
 397     else
 398         for (i=0; i<chrDstW; i++) {
 399             int u=1<<18;
 400             int v=1<<18;
 401             int j;
 402             for (j=0; j<chrFilterSize; j++) {
 403                 u += chrUSrc[j][i] * chrFilter[j];
 404                 v += chrVSrc[j][i] * chrFilter[j];
 405             }
 406
 407             uDest[2*i]= av_clip_uint8(v>>19);
 408             uDest[2*i+1]= av_clip_uint8(u>>19);
 409         }
 410 }
 411
 412 #define output_pixel(pos, val) \
 413         if (target == PIX_FMT_GRAY16BE) { \
 414             AV_WB16(pos, val); \
 415         } else { \
 416             AV_WL16(pos, val); \
 417         }
 418
 419 static av_always_inline void
 420 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 421                         const int16_t **lumSrc, int lumFilterSize,
 422                         const int16_t *chrFilter, const int16_t **chrUSrc,
 423                         const int16_t **chrVSrc, int chrFilterSize,
 424                         const int16_t **alpSrc, uint8_t *dest, int dstW,
 425                         int y, enum PixelFormat target)
 426 {
 427     int i;
 428
 429     for (i = 0; i < (dstW >> 1); i++) {
 430         int j;
 431         int Y1 = 1 << 18;
 432         int Y2 = 1 << 18;
 433         const int i2 = 2 * i;
 434
 435         for (j = 0; j < lumFilterSize; j++) {
 436             Y1 += lumSrc[j][i2]   * lumFilter[j];
 437             Y2 += lumSrc[j][i2+1] * lumFilter[j];
 438         }
 439         Y1 >>= 11;
 440         Y2 >>= 11;
 441         if ((Y1 | Y2) & 0x10000) {
 442             Y1 = av_clip_uint16(Y1);
 443             Y2 = av_clip_uint16(Y2);
 444         }
 445         output_pixel(&dest[2 * i2 + 0], Y1);
 446         output_pixel(&dest[2 * i2 + 2], Y2);
 447     }
 448 }
 449
 450 static av_always_inline void
 451 yuv2gray16_2_c_template(SwsContext *c, const int16_t *buf[2],
 452                         const int16_t *ubuf[2], const int16_t *vbuf[2],
 453                         const int16_t *abuf[2], uint8_t *dest, int dstW,
 454                         int yalpha, int uvalpha, int y,
 455                         enum PixelFormat target)
 456 {
 457     int  yalpha1 = 4095 - yalpha;
 458     int i;
 459     const int16_t *buf0 = buf[0], *buf1 = buf[1];
 460
 461     for (i = 0; i < (dstW >> 1); i++) {
 462         const int i2 = 2 * i;
 463         int Y1 = (buf0[i2  ] * yalpha1 + buf1[i2  ] * yalpha) >> 11;
 464         int Y2 = (buf0[i2+1] * yalpha1 + buf1[i2+1] * yalpha) >> 11;
 465
 466         output_pixel(&dest[2 * i2 + 0], Y1);
 467         output_pixel(&dest[2 * i2 + 2], Y2);
 468     }
 469 }
 470
 471 static av_always_inline void
 472 yuv2gray16_1_c_template(SwsContext *c, const int16_t *buf0,
 473                         const int16_t *ubuf[2], const int16_t *vbuf[2],
 474                         const int16_t *abuf0, uint8_t *dest, int dstW,
 475                         int uvalpha, int y, enum PixelFormat target)
 476 {
 477     int i;
 478
 479     for (i = 0; i < (dstW >> 1); i++) {
 480         const int i2 = 2 * i;
 481         int Y1 = buf0[i2  ] << 1;
 482         int Y2 = buf0[i2+1] << 1;
 483
 484         output_pixel(&dest[2 * i2 + 0], Y1);
 485         output_pixel(&dest[2 * i2 + 2], Y2);
 486     }
 487 }
 488
 489 #undef output_pixel
 490
/* Generates three static functions, <name><ext>_X_c, <name><ext>_2_c and
 * <name><ext>_1_c.  Each one forwards all of its arguments to the matching
 * <name><base>_X_c_template / _2_c_template / _1_c_template function and
 * appends fmt as the template's final `target` argument.
 *
 *   name  common prefix of the wrapper and template names
 *   base  infix present only in the template names (may be empty)
 *   ext   infix present only in the generated wrapper names
 *   fmt   pixel format constant passed as `target`
 */
#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                        const int16_t **lumSrc, int lumFilterSize, \
                        const int16_t *chrFilter, const int16_t **chrUSrc, \
                        const int16_t **chrVSrc, int chrFilterSize, \
                        const int16_t **alpSrc, uint8_t *dest, int dstW, \
                        int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                          alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                        const int16_t *ubuf[2], const int16_t *vbuf[2], \
                        const int16_t *abuf[2], uint8_t *dest, int dstW, \
                        int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                          dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                        const int16_t *ubuf[2], const int16_t *vbuf[2], \
                        const int16_t *abuf0, uint8_t *dest, int dstW, \
                        int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt); \
}

/* Wrappers yuv2gray16LE_{X,2,1}_c and yuv2gray16BE_{X,2,1}_c around the
 * yuv2gray16_*_c_template functions (empty base argument). */
YUV2PACKEDWRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
YUV2PACKEDWRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 524
 525 #define output_pixel(pos, acc) \
 526     if (target == PIX_FMT_MONOBLACK) { \
 527         pos = acc; \
 528     } else { \
 529         pos = ~acc; \
 530     }
 531
 532 static av_always_inline void
 533 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 534                       const int16_t **lumSrc, int lumFilterSize,
 535                       const int16_t *chrFilter, const int16_t **chrUSrc,
 536                       const int16_t **chrVSrc, int chrFilterSize,
 537                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 538                       int y, enum PixelFormat target)
 539 {
 540     const uint8_t * const d128=dither_8x8_220[y&7];
 541     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 542     int i;
 543     int acc = 0;
 544
 545     for (i = 0; i < dstW - 1; i += 2) {
 546         int j;
 547         int Y1 = 1 << 18;
 548         int Y2 = 1 << 18;
 549
 550         for (j = 0; j < lumFilterSize; j++) {
 551             Y1 += lumSrc[j][i]   * lumFilter[j];
 552             Y2 += lumSrc[j][i+1] * lumFilter[j];
 553         }
 554         Y1 >>= 19;
 555         Y2 >>= 19;
 556         if ((Y1 | Y2) & 0x100) {
 557             Y1 = av_clip_uint8(Y1);
 558             Y2 = av_clip_uint8(Y2);
 559         }
 560         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 561         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 562         if ((i & 7) == 6) {
 563             output_pixel(*dest++, acc);
 564         }
 565     }
 566 }
 567
 568 static av_always_inline void
 569 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 570                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 571                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 572                       int yalpha, int uvalpha, int y,
 573                       enum PixelFormat target)
 574 {
 575     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 576     const uint8_t * const d128 = dither_8x8_220[y & 7];
 577     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 578     int  yalpha1 = 4095 - yalpha;
 579     int i;
 580
 581     for (i = 0; i < dstW - 7; i += 8) {
 582         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 583         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 584         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 585         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 586         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 587         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 588         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 589         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 590         output_pixel(*dest++, acc);
 591     }
 592 }
 593
 594 static av_always_inline void
 595 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 596                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 597                       const int16_t *abuf0, uint8_t *dest, int dstW,
 598                       int uvalpha, int y, enum PixelFormat target)
 599 {
 600     const uint8_t * const d128 = dither_8x8_220[y & 7];
 601     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 602     int i;
 603
 604     for (i = 0; i < dstW - 7; i += 8) {
 605         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 606         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 607         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 608         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 609         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 610         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 611         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 612         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 613         output_pixel(*dest++, acc);
 614     }
 615 }
 616
 617 #undef output_pixel
 618
/* Wrappers yuv2monowhite_{X,2,1}_c and yuv2monoblack_{X,2,1}_c around the
 * yuv2mono_*_c_template functions (empty base argument). */
YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 621
 622 #define output_pixels(pos, Y1, U, Y2, V) \
 623     if (target == PIX_FMT_YUYV422) { \
 624         dest[pos + 0] = Y1; \
 625         dest[pos + 1] = U;  \
 626         dest[pos + 2] = Y2; \
 627         dest[pos + 3] = V;  \
 628     } else { \
 629         dest[pos + 0] = U;  \
 630         dest[pos + 1] = Y1; \
 631         dest[pos + 2] = V;  \
 632         dest[pos + 3] = Y2; \
 633     }
 634
 635 static av_always_inline void
 636 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 637                      const int16_t **lumSrc, int lumFilterSize,
 638                      const int16_t *chrFilter, const int16_t **chrUSrc,
 639                      const int16_t **chrVSrc, int chrFilterSize,
 640                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 641                      int y, enum PixelFormat target)
 642 {
 643     int i;
 644
 645     for (i = 0; i < (dstW >> 1); i++) {
 646         int j;
 647         int Y1 = 1 << 18;
 648         int Y2 = 1 << 18;
 649         int U  = 1 << 18;
 650         int V  = 1 << 18;
 651
 652         for (j = 0; j < lumFilterSize; j++) {
 653             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 654             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 655         }
 656         for (j = 0; j < chrFilterSize; j++) {
 657             U += chrUSrc[j][i] * chrFilter[j];
 658             V += chrVSrc[j][i] * chrFilter[j];
 659         }
 660         Y1 >>= 19;
 661         Y2 >>= 19;
 662         U  >>= 19;
 663         V  >>= 19;
 664         if ((Y1 | Y2 | U | V) & 0x100) {
 665             Y1 = av_clip_uint8(Y1);
 666             Y2 = av_clip_uint8(Y2);
 667             U  = av_clip_uint8(U);
 668             V  = av_clip_uint8(V);
 669         }
 670         output_pixels(4*i, Y1, U, Y2, V);
 671     }
 672 }
 673
 674 static av_always_inline void
 675 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 676                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 677                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 678                      int yalpha, int uvalpha, int y,
 679                      enum PixelFormat target)
 680 {
 681     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 682                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 683                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 684     int  yalpha1 = 4095 - yalpha;
 685     int uvalpha1 = 4095 - uvalpha;
 686     int i;
 687
 688     for (i = 0; i < (dstW >> 1); i++) {
 689         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 690         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 691         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 692         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 693
 694         output_pixels(i * 4, Y1, U, Y2, V);
 695     }
 696 }
 697
 698 static av_always_inline void
 699 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 700                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 701                      const int16_t *abuf0, uint8_t *dest, int dstW,
 702                      int uvalpha, int y, enum PixelFormat target)
 703 {
 704     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 705                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 706     int i;
 707
 708     if (uvalpha < 2048) {
 709         for (i = 0; i < (dstW >> 1); i++) {
 710             int Y1 = buf0[i * 2]     >> 7;
 711             int Y2 = buf0[i * 2 + 1] >> 7;
 712             int U  = ubuf1[i]        >> 7;
 713             int V  = vbuf1[i]        >> 7;
 714
 715             output_pixels(i * 4, Y1, U, Y2, V);
 716         }
 717     } else {
 718         for (i = 0; i < (dstW >> 1); i++) {
 719             int Y1 =  buf0[i * 2]          >> 7;
 720             int Y2 =  buf0[i * 2 + 1]      >> 7;
 721             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 722             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 723
 724             output_pixels(i * 4, Y1, U, Y2, V);
 725         }
 726     }
 727 }
 728
 729 #undef output_pixels
 730
/* Wrappers yuv2yuyv422_{X,2,1}_c and yuv2uyvy422_{X,2,1}_c around the
 * yuv2422_*_c_template functions. */
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 733
/* Channel-order selectors for the 48-bit packed writers.  Both rely on
 * locals named `target`, `r` and `b` at the point of use: for
 * PIX_FMT_RGB48LE / PIX_FMT_RGB48BE, r_b is r and b_r is b; for any other
 * target the two are swapped. */
#define r_b ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? r : b)
#define b_r ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? b : r)
 736
 737 static av_always_inline void
 738 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 739                        const int16_t **lumSrc, int lumFilterSize,
 740                        const int16_t *chrFilter, const int16_t **chrUSrc,
 741                        const int16_t **chrVSrc, int chrFilterSize,
 742                        const int16_t **alpSrc, uint8_t *dest, int dstW,
 743                        int y, enum PixelFormat target)
 744 {
 745     int i;
 746
 747     for (i = 0; i < (dstW >> 1); i++) {
 748         int j;
 749         int Y1 = 1 << 18;
 750         int Y2 = 1 << 18;
 751         int U  = 1 << 18;
 752         int V  = 1 << 18;
 753         const uint8_t *r, *g, *b;
 754
 755         for (j = 0; j < lumFilterSize; j++) {
 756             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 757             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 758         }
 759         for (j = 0; j < chrFilterSize; j++) {
 760             U += chrUSrc[j][i] * chrFilter[j];
 761             V += chrVSrc[j][i] * chrFilter[j];
 762         }
 763         Y1 >>= 19;
 764         Y2 >>= 19;
 765         U  >>= 19;
 766         V  >>= 19;
 767         if ((Y1 | Y2 | U | V) & 0x100) {
 768             Y1 = av_clip_uint8(Y1);
 769             Y2 = av_clip_uint8(Y2);
 770             U  = av_clip_uint8(U);
 771             V  = av_clip_uint8(V);
 772         }
 773
 774         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
 775         r = (const uint8_t *) c->table_rV[V];
 776         g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]);
 777         b = (const uint8_t *) c->table_bU[U];
 778
 779         dest[ 0] = dest[ 1] = r_b[Y1];
 780         dest[ 2] = dest[ 3] =   g[Y1];
 781         dest[ 4] = dest[ 5] = b_r[Y1];
 782         dest[ 6] = dest[ 7] = r_b[Y2];
 783         dest[ 8] = dest[ 9] =   g[Y2];
 784         dest[10] = dest[11] = b_r[Y2];
 785         dest += 12;
 786     }
 787 }
 788
 789 static av_always_inline void
 790 yuv2rgb48_2_c_template(SwsContext *c, const int16_t *buf[2],
 791                        const int16_t *ubuf[2], const int16_t *vbuf[2],
 792                        const int16_t *abuf[2], uint8_t *dest, int dstW,
 793                        int yalpha, int uvalpha, int y,
 794                        enum PixelFormat target)
 795 {
 796     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 797                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 798                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 799     int  yalpha1 = 4095 - yalpha;
 800     int uvalpha1 = 4095 - uvalpha;
 801     int i;
 802
 803     for (i = 0; i < (dstW >> 1); i++) {
 804         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 805         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 806         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 807         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 808         const uint8_t *r = (const uint8_t *) c->table_rV[V],
 809                       *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 810                       *b = (const uint8_t *) c->table_bU[U];
 811
 812         dest[ 0] = dest[ 1] = r_b[Y1];
 813         dest[ 2] = dest[ 3] =   g[Y1];
 814         dest[ 4] = dest[ 5] = b_r[Y1];
 815         dest[ 6] = dest[ 7] = r_b[Y2];
 816         dest[ 8] = dest[ 9] =   g[Y2];
 817         dest[10] = dest[11] = b_r[Y2];
 818         dest += 12;
 819     }
 820 }
 821
 822 static av_always_inline void
 823 yuv2rgb48_1_c_template(SwsContext *c, const int16_t *buf0,
 824                        const int16_t *ubuf[2], const int16_t *vbuf[2],
 825                        const int16_t *abuf0, uint8_t *dest, int dstW,
 826                        int uvalpha, int y, enum PixelFormat target)
 827 {
 828     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 829                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 830     int i;
 831
 832     if (uvalpha < 2048) {
 833         for (i = 0; i < (dstW >> 1); i++) {
 834             int Y1 = buf0[i * 2]     >> 7;
 835             int Y2 = buf0[i * 2 + 1] >> 7;
 836             int U  = ubuf1[i]        >> 7;
 837             int V  = vbuf1[i]        >> 7;
 838             const uint8_t *r = (const uint8_t *) c->table_rV[V],
 839                           *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 840                           *b = (const uint8_t *) c->table_bU[U];
 841
 842             dest[ 0] = dest[ 1] = r_b[Y1];
 843             dest[ 2] = dest[ 3] =   g[Y1];
 844             dest[ 4] = dest[ 5] = b_r[Y1];
 845             dest[ 6] = dest[ 7] = r_b[Y2];
 846             dest[ 8] = dest[ 9] =   g[Y2];
 847             dest[10] = dest[11] = b_r[Y2];
 848             dest += 12;
 849         }
 850     } else {
 851         for (i = 0; i < (dstW >> 1); i++) {
 852             int Y1 =  buf0[i * 2]          >> 7;
 853             int Y2 =  buf0[i * 2 + 1]      >> 7;
 854             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 855             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 856             const uint8_t *r = (const uint8_t *) c->table_rV[V],
 857                           *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 858                           *b = (const uint8_t *) c->table_bU[U];
 859
 860             dest[ 0] = dest[ 1] = r_b[Y1];
 861             dest[ 2] = dest[ 3] =   g[Y1];
 862             dest[ 4] = dest[ 5] = b_r[Y1];
 863             dest[ 6] = dest[ 7] = r_b[Y2];
 864             dest[ 8] = dest[ 9] =   g[Y2];
 865             dest[10] = dest[11] = b_r[Y2];
 866             dest += 12;
 867         }
 868     }
 869 }
 870
 /* The 48-bit output templates above are the last users of the r_b/b_r
  * selector macros. */
 871 #undef r_b
 872 #undef b_r
 873
 /* Instantiate the rgb48 templates through YUV2PACKEDWRAPPER (defined
  * earlier in the file) for big-endian RGB48 and BGR48 only; the
  * little-endian instantiations are commented out. */
 874 YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
 875 //YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
 876 YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
 877 //YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
 878
 879 static av_always_inline void
 880 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
 881               int U, int V, int A1, int A2,
 882               const void *_r, const void *_g, const void *_b, int y,
 883               enum PixelFormat target, int hasAlpha)
 884 {
 885     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 886         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 887         uint32_t *dest = (uint32_t *) _dest;
 888         const uint32_t *r = (const uint32_t *) _r;
 889         const uint32_t *g = (const uint32_t *) _g;
 890         const uint32_t *b = (const uint32_t *) _b;
 891
 892 #if CONFIG_SMALL
 893         int sh = hasAlpha ? ((fmt == PIX_FMT_RGB32_1 || fmt == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 894
 895         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
 896         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
 897 #else
 898         if (hasAlpha) {
 899             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
 900
 901             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
 902             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
 903         } else {
 904             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
 905             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 906         }
 907 #endif
 908     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
 909         uint8_t *dest = (uint8_t *) _dest;
 910         const uint8_t *r = (const uint8_t *) _r;
 911         const uint8_t *g = (const uint8_t *) _g;
 912         const uint8_t *b = (const uint8_t *) _b;
 913
 914 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
 915 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
 916         dest[i * 6 + 0] = r_b[Y1];
 917         dest[i * 6 + 1] =   g[Y1];
 918         dest[i * 6 + 2] = b_r[Y1];
 919         dest[i * 6 + 3] = r_b[Y2];
 920         dest[i * 6 + 4] =   g[Y2];
 921         dest[i * 6 + 5] = b_r[Y2];
 922 #undef r_b
 923 #undef b_r
 924     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
 925                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
 926                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
 927         uint16_t *dest = (uint16_t *) _dest;
 928         const uint16_t *r = (const uint16_t *) _r;
 929         const uint16_t *g = (const uint16_t *) _g;
 930         const uint16_t *b = (const uint16_t *) _b;
 931         int dr1, dg1, db1, dr2, dg2, db2;
 932
 933         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
 934             dr1 = dither_2x2_8[ y & 1     ][0];
 935             dg1 = dither_2x2_4[ y & 1     ][0];
 936             db1 = dither_2x2_8[(y & 1) ^ 1][0];
 937             dr2 = dither_2x2_8[ y & 1     ][1];
 938             dg2 = dither_2x2_4[ y & 1     ][1];
 939             db2 = dither_2x2_8[(y & 1) ^ 1][1];
 940         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
 941             dr1 = dither_2x2_8[ y & 1     ][0];
 942             dg1 = dither_2x2_8[ y & 1     ][1];
 943             db1 = dither_2x2_8[(y & 1) ^ 1][0];
 944             dr2 = dither_2x2_8[ y & 1     ][1];
 945             dg2 = dither_2x2_8[ y & 1     ][0];
 946             db2 = dither_2x2_8[(y & 1) ^ 1][1];
 947         } else {
 948             dr1 = dither_4x4_16[ y & 3     ][0];
 949             dg1 = dither_4x4_16[ y & 3     ][1];
 950             db1 = dither_4x4_16[(y & 3) ^ 3][0];
 951             dr2 = dither_4x4_16[ y & 3     ][1];
 952             dg2 = dither_4x4_16[ y & 3     ][0];
 953             db2 = dither_4x4_16[(y & 3) ^ 3][1];
 954         }
 955
 956         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
 957         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
 958     } else /* 8/4-bit */ {
 959         uint8_t *dest = (uint8_t *) _dest;
 960         const uint8_t *r = (const uint8_t *) _r;
 961         const uint8_t *g = (const uint8_t *) _g;
 962         const uint8_t *b = (const uint8_t *) _b;
 963         int dr1, dg1, db1, dr2, dg2, db2;
 964
 965         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
 966             const uint8_t * const d64 = dither_8x8_73[y & 7];
 967             const uint8_t * const d32 = dither_8x8_32[y & 7];
 968             dr1 = dg1 = d32[(i * 2 + 0) & 7];
 969             db1 =       d64[(i * 2 + 0) & 7];
 970             dr2 = dg2 = d32[(i * 2 + 1) & 7];
 971             db2 =       d64[(i * 2 + 1) & 7];
 972         } else {
 973             const uint8_t * const d64  = dither_8x8_73 [y & 7];
 974             const uint8_t * const d128 = dither_8x8_220[y & 7];
 975             dr1 = db1 = d128[(i * 2 + 0) & 7];
 976             dg1 =        d64[(i * 2 + 0) & 7];
 977             dr2 = db2 = d128[(i * 2 + 1) & 7];
 978             dg2 =        d64[(i * 2 + 1) & 7];
 979         }
 980
 981         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
 982             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
 983                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
 984         } else {
 985             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
 986             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
 987         }
 988     }
 989 }
 990
 991 static av_always_inline void
 992 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
 993                      const int16_t **lumSrc, int lumFilterSize,
 994                      const int16_t *chrFilter, const int16_t **chrUSrc,
 995                      const int16_t **chrVSrc, int chrFilterSize,
 996                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 997                      int y, enum PixelFormat target, int hasAlpha)
 998 {
 999     int i;
1000
1001     for (i = 0; i < (dstW >> 1); i++) {
1002         int j;
1003         int Y1 = 1 << 18;
1004         int Y2 = 1 << 18;
1005         int U  = 1 << 18;
1006         int V  = 1 << 18;
1007         int av_unused A1, A2;
1008         const void *r, *g, *b;
1009
1010         for (j = 0; j < lumFilterSize; j++) {
1011             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1012             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1013         }
1014         for (j = 0; j < chrFilterSize; j++) {
1015             U += chrUSrc[j][i] * chrFilter[j];
1016             V += chrVSrc[j][i] * chrFilter[j];
1017         }
1018         Y1 >>= 19;
1019         Y2 >>= 19;
1020         U  >>= 19;
1021         V  >>= 19;
1022         if ((Y1 | Y2 | U | V) & 0x100) {
1023             Y1 = av_clip_uint8(Y1);
1024             Y2 = av_clip_uint8(Y2);
1025             U  = av_clip_uint8(U);
1026             V  = av_clip_uint8(V);
1027         }
1028         if (hasAlpha) {\
1029             A1 = 1 << 18;
1030             A2 = 1 << 18;
1031             for (j = 0; j < lumFilterSize; j++) {
1032                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1033                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1034             }
1035             A1 >>= 19;
1036             A2 >>= 19;
1037             if ((A1 | A2) & 0x100) {
1038                 A1 = av_clip_uint8(A1);
1039                 A2 = av_clip_uint8(A2);
1040             }
1041         }
1042
1043         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1044         r =  c->table_rV[V];
1045         g = (c->table_gU[U] + c->table_gV[V]);
1046         b =  c->table_bU[U];
1047
1048         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1049                       r, g, b, y, target, hasAlpha);
1050     }
1051 }
1052
1053 static av_always_inline void
1054 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1055                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1056                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1057                      int yalpha, int uvalpha, int y,
1058                      enum PixelFormat target, int hasAlpha)
1059 {
1060     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1061                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1062                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1063                   *abuf0 = abuf[0], *abuf1 = abuf[1];
1064     int  yalpha1 = 4095 - yalpha;
1065     int uvalpha1 = 4095 - uvalpha;
1066     int i;
1067
1068     for (i = 0; i < (dstW >> 1); i++) {
1069         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1070         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1071         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1072         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1073         int A1, A2;
1074         const void *r =  c->table_rV[V],
1075                    *g = (c->table_gU[U] + c->table_gV[V]),
1076                    *b =  c->table_bU[U];
1077
1078         if (hasAlpha) {
1079             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1080             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1081         }
1082
1083         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1084                       r, g, b, y, target, hasAlpha);
1085     }
1086 }
1087
1088 static av_always_inline void
1089 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1090                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1091                      const int16_t *abuf0, uint8_t *dest, int dstW,
1092                      int uvalpha, int y, enum PixelFormat target,
1093                      int hasAlpha)
1094 {
1095     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1096                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1097     int i;
1098
1099     if (uvalpha < 2048) {
1100         for (i = 0; i < (dstW >> 1); i++) {
1101             int Y1 = buf0[i * 2]     >> 7;
1102             int Y2 = buf0[i * 2 + 1] >> 7;
1103             int U  = ubuf1[i]        >> 7;
1104             int V  = vbuf1[i]        >> 7;
1105             int A1, A2;
1106             const void *r =  c->table_rV[V],
1107                        *g = (c->table_gU[U] + c->table_gV[V]),
1108                        *b =  c->table_bU[U];
1109
1110             if (hasAlpha) {
1111                 A1 = abuf0[i * 2    ] >> 7;
1112                 A2 = abuf0[i * 2 + 1] >> 7;
1113             }
1114
1115             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1116                           r, g, b, y, target, hasAlpha);
1117         }
1118     } else {
1119         for (i = 0; i < (dstW >> 1); i++) {
1120             int Y1 =  buf0[i * 2]          >> 7;
1121             int Y2 =  buf0[i * 2 + 1]      >> 7;
1122             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1123             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1124             int A1, A2;
1125             const void *r =  c->table_rV[V],
1126                        *g = (c->table_gU[U] + c->table_gV[V]),
1127                        *b =  c->table_bU[U];
1128
1129             if (hasAlpha) {
1130                 A1 = abuf0[i * 2    ] >> 7;
1131                 A2 = abuf0[i * 2 + 1] >> 7;
1132             }
1133
1134             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1135                           r, g, b, y, target, hasAlpha);
1136         }
1137     }
1138 }
1139
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) defines the static
 * function <name><ext>_X_c, which forwards all of its arguments to
 * <name><base>_X_c_template with fmt and hasAlpha appended.
 *
 * hasAlpha is pasted verbatim into the wrapper body, so it may be a constant
 * or an expression that refers to the wrapper's parameter c.
 */
1140 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1141 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1142                                 const int16_t **lumSrc, int lumFilterSize, \
1143                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1144                                 const int16_t **chrVSrc, int chrFilterSize, \
1145                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1146                                 int y) \
1147 { \
1148     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1149                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1150                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1151 }
/*
 * YUV2RGBWRAPPER expands YUV2RGBWRAPPERX and additionally defines
 * <name><ext>_2_c and <name><ext>_1_c, which forward to
 * <name><base>_2_c_template and <name><base>_1_c_template in the same way.
 */
1152 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1153 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1154 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1155                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1156                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1157                                 int yalpha, int uvalpha, int y) \
1158 { \
1159     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1160                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1161 } \
1162  \
1163 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1164                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1165                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1166                                 int uvalpha, int y) \
1167 { \
1168     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1169                                   dstW, uvalpha, y, fmt, hasAlpha); \
1170 }
1171
/*
 * Table-based packed RGB output functions.
 *
 * With CONFIG_SMALL one wrapper per 32-bit layout is generated and the alpha
 * decision is made at run time from c->alpPixBuf.  Otherwise separate
 * a32/a32_1 (hasAlpha = 1, only if CONFIG_SWSCALE_ALPHA) and x32/x32_1
 * (hasAlpha = 0) wrappers are generated.  All remaining formats are
 * instantiated with hasAlpha = 0.
 */
1172 #if CONFIG_SMALL
1173 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1174 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1175 #else
1176 #if CONFIG_SWSCALE_ALPHA
1177 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1178 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1179 #endif
1180 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1181 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1182 #endif
1183 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1184 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1185 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1186 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1187 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1188 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1189 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1190 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1191
1192 static av_always_inline void
1193 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1194                           const int16_t **lumSrc, int lumFilterSize,
1195                           const int16_t *chrFilter, const int16_t **chrUSrc,
1196                           const int16_t **chrVSrc, int chrFilterSize,
1197                           const int16_t **alpSrc, uint8_t *dest,
1198                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1199 {
1200     int i;
1201     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1202
1203     for (i = 0; i < dstW; i++) {
1204         int j;
1205         int Y = 0;
1206         int U = -128 << 19;
1207         int V = -128 << 19;
1208         int av_unused A;
1209         int R, G, B;
1210
1211         for (j = 0; j < lumFilterSize; j++) {
1212             Y += lumSrc[j][i] * lumFilter[j];
1213         }
1214         for (j = 0; j < chrFilterSize; j++) {
1215             U += chrUSrc[j][i] * chrFilter[j];
1216             V += chrVSrc[j][i] * chrFilter[j];
1217         }
1218         Y >>= 10;
1219         U >>= 10;
1220         V >>= 10;
1221         if (hasAlpha) {
1222             A = 1 << 21;
1223             for (j = 0; j < lumFilterSize; j++) {
1224                 A += alpSrc[j][i] * lumFilter[j];
1225             }
1226             A >>= 19;
1227             if (A & 0x100)
1228                 A = av_clip_uint8(A);
1229         }
1230         Y -= c->yuv2rgb_y_offset;
1231         Y *= c->yuv2rgb_y_coeff;
1232         Y += 1 << 21;
1233         R = Y + V*c->yuv2rgb_v2r_coeff;
1234         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1235         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1236         if ((R | G | B) & 0xC0000000) {
1237             R = av_clip_uintp2(R, 30);
1238             G = av_clip_uintp2(G, 30);
1239             B = av_clip_uintp2(B, 30);
1240         }
1241
1242         switch(target) {
1243         case PIX_FMT_ARGB:
1244             dest[0] = hasAlpha ? A : 255;
1245             dest[1] = R >> 22;
1246             dest[2] = G >> 22;
1247             dest[3] = B >> 22;
1248             break;
1249         case PIX_FMT_RGB24:
1250             dest[0] = R >> 22;
1251             dest[1] = G >> 22;
1252             dest[2] = B >> 22;
1253             break;
1254         case PIX_FMT_RGBA:
1255             dest[0] = R >> 22;
1256             dest[1] = G >> 22;
1257             dest[2] = B >> 22;
1258             dest[3] = hasAlpha ? A : 255;
1259             break;
1260         case PIX_FMT_ABGR:
1261             dest[0] = hasAlpha ? A : 255;
1262             dest[1] = B >> 22;
1263             dest[2] = G >> 22;
1264             dest[3] = R >> 22;
1265             dest += 4;
1266             break;
1267         case PIX_FMT_BGR24:
1268             dest[0] = B >> 22;
1269             dest[1] = G >> 22;
1270             dest[2] = R >> 22;
1271             break;
1272         case PIX_FMT_BGRA:
1273             dest[0] = B >> 22;
1274             dest[1] = G >> 22;
1275             dest[2] = R >> 22;
1276             dest[3] = hasAlpha ? A : 255;
1277             break;
1278         }
1279         dest += step;
1280     }
1281 }
1282
/*
 * Full-chroma packed RGB output functions; only the _X_c variant exists
 * (YUV2RGBWRAPPERX).
 *
 * With CONFIG_SMALL the alpha decision is made at run time from
 * c->alpPixBuf.  Otherwise the *a*32_full wrappers (hasAlpha = 1, only if
 * CONFIG_SWSCALE_ALPHA) and the *x*32_full wrappers (hasAlpha = 0) are
 * generated separately.  The 24-bit wrappers never carry alpha.
 */
1283 #if CONFIG_SMALL
1284 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1285 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1286 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1287 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1288 #else
1289 #if CONFIG_SWSCALE_ALPHA
1290 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1291 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1292 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1293 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1294 #endif
1295 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1296 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1297 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1298 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1299 #endif
1300 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1301 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1302
1303 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1304                                        int width, int height,
1305                                        int y, uint8_t val)
1306 {
1307     int i;
1308     uint8_t *ptr = plane + stride*y;
1309     for (i=0; i<height; i++) {
1310         memset(ptr, val, width);
1311         ptr += stride;
1312     }
1313 }
1314
/* Helper macros for the 48-bit RGB input templates below.  They expand in
 * the scope of the using function and rely on its origin parameter and its
 * r_b / b_r locals.
 *
 * input_pixel(pos): 16-bit read at pos, big- or little-endian according to
 * isBE(origin). */
1315 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1316
/* r / b: pick the red and blue component out of r_b (first component in
 * memory) and b_r (third component); swapped for the BGR48 formats. */
1317 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1318 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1319
1320 static av_always_inline void
1321 rgb48ToY_c_template(uint8_t *dst, const uint8_t *src, int width,
1322                     enum PixelFormat origin)
1323 {
1324     int i;
1325     for (i = 0; i < width; i++) {
1326         int r_b = input_pixel(&src[i*6+0]) >> 8;
1327         int   g = input_pixel(&src[i*6+2]) >> 8;
1328         int b_r = input_pixel(&src[i*6+4]) >> 8;
1329
1330         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1331     }
1332 }
1333
1334 static av_always_inline void
1335 rgb48ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1336                     const uint8_t *src1, const uint8_t *src2,
1337                     int width, enum PixelFormat origin)
1338 {
1339     int i;
1340     assert(src1==src2);
1341     for (i = 0; i < width; i++) {
1342         int r_b = input_pixel(&src1[i*6+0]) >> 8;
1343         int   g = input_pixel(&src1[i*6+2]) >> 8;
1344         int b_r = input_pixel(&src1[i*6+4]) >> 8;
1345
1346         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1347         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1348     }
1349 }
1350
1351 static av_always_inline void
1352 rgb48ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1353                           const uint8_t *src1, const uint8_t *src2,
1354                           int width, enum PixelFormat origin)
1355 {
1356     int i;
1357     assert(src1==src2);
1358     for (i = 0; i < width; i++) {
1359         int r_b = (input_pixel(&src1[12*i + 0]) >> 8) + (input_pixel(&src1[12*i + 6]) >> 8);
1360         int   g = (input_pixel(&src1[12*i + 2]) >> 8) + (input_pixel(&src1[12*i + 8]) >> 8);
1361         int b_r = (input_pixel(&src1[12*i + 4]) >> 8) + (input_pixel(&src1[12*i + 10]) >> 8);
1362
1363         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1364         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1365     }
1366 }
1367
/* The 48-bit input templates above are the only users of these helpers. */
1368 #undef r
1369 #undef b
1370 #undef input_pixel
1371
/*
 * rgb48funcs(pattern, BE_LE, origin) defines three static functions,
 * <pattern>48<BE_LE>ToY_c, <pattern>48<BE_LE>ToUV_c and
 * <pattern>48<BE_LE>ToUV_half_c, each forwarding to the matching
 * rgb48To*_c_template with origin as the pixel format.  The trailing
 * uint32_t *unused parameter is not passed on.
 */
1372 #define rgb48funcs(pattern, BE_LE, origin) \
1373 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *dst, const uint8_t *src, \
1374                                     int width, uint32_t *unused) \
1375 { \
1376     rgb48ToY_c_template(dst, src, width, origin); \
1377 } \
1378  \
1379 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1380                                     const uint8_t *src1, const uint8_t *src2, \
1381                                     int width, uint32_t *unused) \
1382 { \
1383     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1384 } \
1385  \
1386 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1387                                     const uint8_t *src1, const uint8_t *src2, \
1388                                     int width, uint32_t *unused) \
1389 { \
1390     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1391 }
1392
/* 48-bit input readers for both component orders and both endiannesses,
 * e.g. rgb48LEToY_c / rgb48LEToUV_c / rgb48LEToUV_half_c. */
1393 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1394 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1395 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1396 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1397
/* input_pixel(i): read pixel number i from src (both src and origin are
 * taken from the expansion site).  The four 32-bit formats are read as one
 * 4-byte word through AV_RN32A; everything else is read as a 2-byte value,
 * big- or little-endian according to isBE(origin). */
1398 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1399                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1400                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1401
1402 static av_always_inline void
1403 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1404                        int width, enum PixelFormat origin,
1405                        int shr,   int shg,   int shb, int shp,
1406                        int maskr, int maskg, int maskb,
1407                        int rsh,   int gsh,   int bsh, int S)
1408 {
1409     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1410               rnd = 33 << (S - 1);
1411     int i;
1412
1413     for (i = 0; i < width; i++) {
1414         int px = input_pixel(i) >> shp;
1415         int b = (px & maskb) >> shb;
1416         int g = (px & maskg) >> shg;
1417         int r = (px & maskr) >> shr;
1418
1419         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1420     }
1421 }
1422
1423 static av_always_inline void
1424 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1425                         const uint8_t *src, int width,
1426                         enum PixelFormat origin,
1427                         int shr,   int shg,   int shb, int shp,
1428                         int maskr, int maskg, int maskb,
1429                         int rsh,   int gsh,   int bsh, int S)
1430 {
1431     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1432               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1433               rnd = 257 << (S - 1);
1434     int i;
1435
1436     for (i = 0; i < width; i++) {
1437         int px = input_pixel(i) >> shp;
1438         int b = (px & maskb) >> shb;
1439         int g = (px & maskg) >> shg;
1440         int r = (px & maskr) >> shr;
1441
1442         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1443         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1444     }
1445 }
1446
/**
 * Convert a line of packed 15/16/32-bit RGB to 8-bit U and V at half
 * horizontal resolution: output sample i is computed from input pixels
 * 2*i and 2*i + 1.
 *
 * The two pixels are added while still packed; the summed components are
 * then extracted with masks widened by one bit.  The final shift is S + 1
 * and the rounding/offset term is 257 << S.
 */
1447 static av_always_inline void
1448 rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1449                              const uint8_t *src, int width,
1450                              enum PixelFormat origin,
1451                              int shr,   int shg,   int shb, int shp,
1452                              int maskr, int maskg, int maskb,
1453                              int rsh,   int gsh,   int bsh, int S)
1454 {
         /* maskgx covers every bit that belongs to neither red nor blue */
1455     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1456               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1457               rnd = 257 << S, maskgx = ~(maskr | maskb);
1458     int i;
1459
         /* each mask gains the next higher bit, since the fields extracted
          * below hold the sum of two pixels */
1460     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1461     for (i = 0; i < width; i++) {
1462         int px0 = input_pixel(2 * i + 0) >> shp;
1463         int px1 = input_pixel(2 * i + 1) >> shp;
             /* g: sum of the non-red/blue bits of both pixels;
              * rb: what remains of the packed sum, i.e. red and blue */
1464         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1465         int rb = px0 + px1 - g;
1466
1467         b = (rb & maskb) >> shb;
             /* with a pixel pre-shift or a 565 layout the green sum is only
              * shifted; otherwise it is masked with the widened maskg first */
1468         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1469             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1470             g >>= shg;
1471         } else {
1472             g = (g  & maskg) >> shg;
1473         }
1474         r = (rb & maskr) >> shr;
1475
1476         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
1477         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
1478     }
1479 }
1480
/* The 16/32-bit input templates above are the only users of this helper. */
1481 #undef input_pixel
1482
/*
 * rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, maskg, maskb,
 *                  rsh, gsh, bsh, S)
 * defines the static functions <name>ToY_c, <name>ToUV_c and
 * <name>ToUV_half_c.  Each forwards dst/src/width to the matching
 * rgb16_32To*_c_template together with fmt and the shift/mask constants.
 * The dummy second source pointer and the uint32_t *unused parameter are
 * not passed on.
 */
1483 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1484                          maskg, maskb, rsh, gsh, bsh, S) \
1485 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1486                           int width, uint32_t *unused) \
1487 { \
1488     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1489                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1490 } \
1491  \
1492 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1493                            const uint8_t *src, const uint8_t *dummy, \
1494                            int width, uint32_t *unused) \
1495 { \
1496     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1497                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1498 } \
1499  \
1500 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1501                                 const uint8_t *src, const uint8_t *dummy, \
1502                                 int width, uint32_t *unused) \
1503 { \
1504     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1505                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1506 }
1507
/*
 * Input readers for the packed 32-, 16- and 15-bit RGB formats.
 * Columns: fmt, name, shr, shg, shb, shp, maskr, maskg, maskb,
 *          rsh, gsh, bsh, S.
 * The 32-bit *_1 variants differ from their plain counterparts only in
 * shp = 8; the 15-bit formats use S = RGB2YUV_SHIFT+7 instead of +8.
 */
1508 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1509 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1510 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1511 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1512 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1513 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1514 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1515 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1516 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1517 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1518 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1519 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1520
/**
 * Extract the alpha plane from a row of packed 4-byte pixels whose alpha
 * sample is the first byte of every pixel.
 *
 * @param dst    output row; receives width bytes
 * @param src    packed input row; read at byte offsets 0, 4, 8, ...
 * @param width  number of pixels to convert
 * @param unused not used by this function
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    uint8_t *out = dst;
    int px;

    for (px = 0; px < width; px++)
        *out++ = src[4 * px];
}
1528
/**
 * Extract the alpha plane from a row of packed 4-byte pixels whose alpha
 * sample is the last byte of every pixel.
 *
 * @param dst    output row; receives width bytes
 * @param src    packed input row; read at byte offsets 3, 7, 11, ...
 * @param width  number of pixels to convert
 * @param unused not used by this function
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int px = 0;

    while (px < width) {
        dst[px] = src[4 * px + 3];
        px++;
    }
}
1536
/**
 * Convert a row of palette indices to 8-bit samples by table lookup.
 * The low 8 bits of the selected 32-bit palette entry become the output.
 *
 * @param dst   output row; receives width bytes
 * @param src   row of palette indices
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];
        dst[x] = pal[index] & 0xFF;
    }
}
1546
/**
 * Convert a row of palette indices to two 8-bit chroma rows by table lookup.
 * Bits 8..15 of the selected palette entry go to dstU and bits 16..23 go
 * to dstV.
 *
 * @param dstU  first output row; receives width bytes
 * @param dstV  second output row; receives width bytes
 * @param src1  row of palette indices
 * @param src2  must be the same pointer as src1 (asserted)
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        const int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
        x++;
    }
}
1560
/**
 * Expand a row of 1-bit-per-pixel samples to 8-bit samples, inverting
 * them: a clear bit becomes 255 and a set bit becomes 0. Within each
 * source byte the pixels are taken from the most significant bit down.
 *
 * Every one of the width pixels is written. The earlier loop processed
 * only width/8 whole bytes and left the last width%8 pixels of the row
 * untouched.
 *
 * @param dst    output row; receives width bytes
 * @param src    packed bit row; (width + 7) / 8 bytes are read
 * @param width  number of pixels to convert
 * @param unused not used by this function
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;

    for (i = 0; i < width; i++) {
        /* XOR with 0xFF inverts the byte without leaving the 0..255 range */
        const unsigned bits = src[i >> 3] ^ 0xFFu;

        dst[i] = ((bits >> (7 - (i & 7))) & 1) * 255;
    }
}
1571
/**
 * Expand a row of 1-bit-per-pixel samples to 8-bit samples: a set bit
 * becomes 255 and a clear bit becomes 0. Within each source byte the
 * pixels are taken from the most significant bit down.
 *
 * Every one of the width pixels is written. The earlier loop processed
 * only width/8 whole bytes and left the last width%8 pixels of the row
 * untouched.
 *
 * @param dst    output row; receives width bytes
 * @param src    packed bit row; (width + 7) / 8 bytes are read
 * @param width  number of pixels to convert
 * @param unused not used by this function
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;

    for (i = 0; i < width; i++) {
        const unsigned bits = src[i >> 3];

        dst[i] = ((bits >> (7 - (i & 7))) & 1) * 255;
    }
}
1582
1583 //FIXME yuy2* can read up to 7 samples too much
1584
/**
 * Pick every even-offset byte of a packed row (offsets 0, 2, 4, ...).
 *
 * @param dst    output row; receives width bytes
 * @param src    packed input row
 * @param width  number of samples to extract
 * @param unused not used by this function
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    uint8_t *out = dst;
    int n;

    for (n = 0; n < width; n++)
        *out++ = src[2 * n];
}
1592
/**
 * Split the chroma bytes out of a packed row made of 4-byte groups:
 * byte 1 of each group goes to dstU and byte 3 goes to dstV.
 *
 * @param dstU   first output row; receives width bytes
 * @param dstV   second output row; receives width bytes
 * @param src1   packed input row
 * @param src2   must be the same pointer as src1 (asserted after the loop)
 * @param width  number of 4-byte groups to process
 * @param unused not used by this function
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int n = 0;

    while (n < width) {
        const int base = 4 * n;

        dstU[n] = src1[base + 1];
        dstV[n] = src1[base + 3];
        n++;
    }
    assert(src1 == src2);
}
1603
/**
 * Reduce two rows of 2-byte samples to one byte each by keeping the byte
 * at the odd offset (1, 3, 5, ...) of every sample.
 *
 * @param dstU   receives width bytes taken from src1
 * @param dstV   receives width bytes taken from src2
 * @param src1   first input row
 * @param src2   second input row
 * @param width  number of samples per row
 * @param unused not used by this function
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int n;

    for (n = 0; n < width; n++) {
        const int odd = 2 * n + 1;

        dstU[n] = src1[odd];
        dstV[n] = src2[odd];
    }
}
1613
1614 /* This is almost identical to the yuy2 readers above, and exists only because
1615  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Pick every odd-offset byte of a packed row (offsets 1, 3, 5, ...).
 *
 * @param dst    output row; receives width bytes
 * @param src    packed input row
 * @param width  number of samples to extract
 * @param unused not used by this function
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int n = 0;

    while (n < width) {
        dst[n] = src[2 * n + 1];
        n++;
    }
}
1623
/**
 * Split the chroma bytes out of a packed row made of 4-byte groups:
 * byte 0 of each group goes to dstU and byte 2 goes to dstV.
 *
 * @param dstU   first output row; receives width bytes
 * @param dstV   second output row; receives width bytes
 * @param src1   packed input row
 * @param src2   must be the same pointer as src1 (asserted after the loop)
 * @param width  number of 4-byte groups to process
 * @param unused not used by this function
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int n;

    for (n = 0; n < width; n++) {
        const int base = 4 * n;

        dstU[n] = src1[base];
        dstV[n] = src1[base + 2];
    }
    assert(src1 == src2);
}
1634
/**
 * Reduce two rows of 2-byte samples to one byte each by keeping the byte
 * at the even offset (0, 2, 4, ...) of every sample.
 *
 * @param dstU   receives width bytes taken from src1
 * @param dstV   receives width bytes taken from src2
 * @param src1   first input row
 * @param src2   second input row
 * @param width  number of samples per row
 * @param unused not used by this function
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int n = 0;

    while (n < width) {
        const int even = 2 * n;

        dstU[n] = src1[even];
        dstV[n] = src2[even];
        n++;
    }
}
1644
1645 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1646                                         const uint8_t *src, int width)
1647 {
1648     int i;
1649     for (i = 0; i < width; i++) {
1650         dst1[i] = src[2*i+0];
1651         dst2[i] = src[2*i+1];
1652     }
1653 }
1654
/**
 * Split an interleaved chroma row with nvXXtoUV_c(): the first byte of
 * every pair lands in dstU, the second in dstV.
 *
 * @param dstU   receives width bytes
 * @param dstV   receives width bytes
 * @param src1   interleaved input row
 * @param src2   not used by this function
 * @param width  number of byte pairs
 * @param unused not used by this function
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const firstOfPair  = dstU;
    uint8_t *const secondOfPair = dstV;

    nvXXtoUV_c(firstOfPair, secondOfPair, src1, width);
}
1661
/**
 * Split an interleaved chroma row with nvXXtoUV_c(), with the outputs
 * swapped relative to nv12ToUV_c(): the first byte of every pair lands in
 * dstV, the second in dstU.
 *
 * @param dstU   receives width bytes
 * @param dstV   receives width bytes
 * @param src1   interleaved input row
 * @param src2   not used by this function
 * @param width  number of byte pairs
 * @param unused not used by this function
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const firstOfPair  = dstV;
    uint8_t *const secondOfPair = dstU;

    nvXXtoUV_c(firstOfPair, secondOfPair, src1, width);
}
1668
1669 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1670
1671 // FIXME Maybe dither instead.
1672 static av_always_inline void
1673 yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1674                           const uint8_t *_srcU, const uint8_t *_srcV,
1675                           int width, enum PixelFormat origin, int depth)
1676 {
1677     int i;
1678     const uint16_t *srcU = (const uint16_t *) _srcU;
1679     const uint16_t *srcV = (const uint16_t *) _srcV;
1680
1681     for (i = 0; i < width; i++) {
1682         dstU[i] = input_pixel(&srcU[i]) >> (depth - 8);
1683         dstV[i] = input_pixel(&srcV[i]) >> (depth - 8);
1684     }
1685 }
1686
1687 static av_always_inline void
1688 yuv9_or_10ToY_c_template(uint8_t *dstY, const uint8_t *_srcY,
1689                          int width, enum PixelFormat origin, int depth)
1690 {
1691     int i;
1692     const uint16_t *srcY = (const uint16_t*)_srcY;
1693
1694     for (i = 0; i < width; i++)
1695         dstY[i] = input_pixel(&srcY[i]) >> (depth - 8);
1696 }
1697
1698 #undef input_pixel
1699
/* Generate the reader pair for one high-bit-depth format:
 * <BE_LE><depth>ToUV_c and <BE_LE><depth>ToY_c. Both are thin wrappers that
 * hand origin and depth to yuv9_OR_10ToUV_c_template() and
 * yuv9_or_10ToY_c_template() respectively; their trailing uint32_t *
 * argument is not used. */
1700 #define YUV_NBPS(depth, BE_LE, origin) \
1701 static void BE_LE ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1702                                      const uint8_t *srcU, const uint8_t *srcV, \
1703                                      int width, uint32_t *unused) \
1704 { \
1705     yuv9_OR_10ToUV_c_template(dstU, dstV, srcU, srcV, width, origin, depth); \
1706 } \
1707 static void BE_LE ## depth ## ToY_c(uint8_t *dstY, const uint8_t *srcY, \
1708                                     int width, uint32_t *unused) \
1709 { \
1710     yuv9_or_10ToY_c_template(dstY, srcY, width, origin, depth); \
1711 }
1712
/* Instantiates LE9ToUV_c/LE9ToY_c, BE9ToUV_c/BE9ToY_c, LE10ToUV_c/LE10ToY_c
 * and BE10ToUV_c/BE10ToY_c. */
1713 YUV_NBPS( 9, LE, PIX_FMT_YUV420P9LE);
1714 YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
1715 YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
1716 YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
1717
1718 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1719                        int width, uint32_t *unused)
1720 {
1721     int i;
1722     for (i=0; i<width; i++) {
1723         int b= src[i*3+0];
1724         int g= src[i*3+1];
1725         int r= src[i*3+2];
1726
1727         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1728     }
1729 }
1730
1731 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1732                         const uint8_t *src2, int width, uint32_t *unused)
1733 {
1734     int i;
1735     for (i=0; i<width; i++) {
1736         int b= src1[3*i + 0];
1737         int g= src1[3*i + 1];
1738         int r= src1[3*i + 2];
1739
1740         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1741         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1742     }
1743     assert(src1 == src2);
1744 }
1745
1746 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1747                              const uint8_t *src2, int width, uint32_t *unused)
1748 {
1749     int i;
1750     for (i=0; i<width; i++) {
1751         int b= src1[6*i + 0] + src1[6*i + 3];
1752         int g= src1[6*i + 1] + src1[6*i + 4];
1753         int r= src1[6*i + 2] + src1[6*i + 5];
1754
1755         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1756         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1757     }
1758     assert(src1 == src2);
1759 }
1760
1761 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1762                        uint32_t *unused)
1763 {
1764     int i;
1765     for (i=0; i<width; i++) {
1766         int r= src[i*3+0];
1767         int g= src[i*3+1];
1768         int b= src[i*3+2];
1769
1770         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1771     }
1772 }
1773
1774 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1775                         const uint8_t *src2, int width, uint32_t *unused)
1776 {
1777     int i;
1778     assert(src1==src2);
1779     for (i=0; i<width; i++) {
1780         int r= src1[3*i + 0];
1781         int g= src1[3*i + 1];
1782         int b= src1[3*i + 2];
1783
1784         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1785         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1786     }
1787 }
1788
1789 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1790                              const uint8_t *src2, int width, uint32_t *unused)
1791 {
1792     int i;
1793     assert(src1==src2);
1794     for (i=0; i<width; i++) {
1795         int r= src1[6*i + 0] + src1[6*i + 3];
1796         int g= src1[6*i + 1] + src1[6*i + 4];
1797         int b= src1[6*i + 2] + src1[6*i + 5];
1798
1799         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1800         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1801     }
1802 }
1803
/**
 * Generic horizontal filter (bilinear / bicubic scaling).
 *
 * For output sample i the filterSize source bytes starting at
 * filterPos[i] are multiplied by the coefficients
 * filter[filterSize * i .. filterSize * i + filterSize - 1] and summed.
 * The sum is shifted right by 7 and clipped from above to (1 << 15) - 1,
 * because the cubic kernel can overflow the positive int16_t range.
 *
 * @param dst        output row; receives dstW samples
 * @param dstW       number of output samples
 * @param src        8-bit input row
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  first source index for each output sample
 * @param filterSize number of taps per output sample
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    int out;

    for (out = 0; out < dstW; out++) {
        const int first = filterPos[out];
        const int base  = filterSize * out;
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)src[first + tap]) * filter[base + tap];

        dst[out] = FFMIN(acc >> 7, (1 << 15) - 1);
    }
}
1822
1823 //FIXME all pal and rgb srcFormats could do this conversion as well
1824 //FIXME all scalers more complex than bilinear could do half of this transform
/**
 * Rescale two rows of intermediate chroma samples in place:
 * every sample is first clipped from above to 30775 and then mapped to
 * (sample * 4663 - 9289992) >> 12.
 *
 * @param dstU  first row; width samples, updated in place
 * @param dstV  second row; width samples, updated in place
 * @param width number of samples per row
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        const int u = FFMIN(dstU[x], 30775);
        dstU[x] = (u * 4663 - 9289992) >> 12;

        /* V is read only after U has been stored, as in the original */
        {
            const int v = FFMIN(dstV[x], 30775);
            dstV[x] = (v * 4663 - 9289992) >> 12;
        }
        x++;
    }
}
/**
 * Rescale two rows of intermediate chroma samples in place:
 * every sample is mapped to (sample * 1799 + 4081085) >> 11.
 *
 * @param dstU  first row; width samples, updated in place
 * @param dstV  second row; width samples, updated in place
 * @param width number of samples per row
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int16_t *u = dstU;
    int16_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, u++, v++) {
        *u = (*u * 1799 + 4081085) >> 11;
        *v = (*v * 1799 + 4081085) >> 11;
    }
}
/**
 * Rescale one row of intermediate luma samples in place:
 * every sample is first clipped from above to 30189 and then mapped to
 * (sample * 19077 - 39057361) >> 14.
 *
 * @param dst   row of width samples, updated in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int clipped = FFMIN(dst[x], 30189);

        dst[x] = (clipped * 19077 - 39057361) >> 14;
        x++;
    }
}
/**
 * Rescale one row of intermediate luma samples in place:
 * every sample is mapped to (sample * 14071 + 33561947) >> 14.
 *
 * @param dst   row of width samples, updated in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int scaled = dst[x] * 14071 + 33561947;

        dst[x] = scaled >> 14;
        x++;
    }
}
1853
1854 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1855                            const uint8_t *src, int srcW, int xInc)
1856 {
1857     int i;
1858     unsigned int xpos=0;
1859     for (i=0;i<dstWidth;i++) {
1860         register unsigned int xx=xpos>>16;
1861         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1862         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1863         xpos+=xInc;
1864     }
1865 }
1866
1867 // *** horizontal scale Y line to temp buffer
1868 static av_always_inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1869                                      const uint8_t *src, int srcW, int xInc,
1870                                      const int16_t *hLumFilter,
1871                                      const int16_t *hLumFilterPos, int hLumFilterSize,
1872                                      uint8_t *formatConvBuffer,
1873                                      uint32_t *pal, int isAlpha)
1874 {
1875     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1876     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1877
1878     if (toYV12) {
1879         toYV12(formatConvBuffer, src, srcW, pal);
1880         src= formatConvBuffer;
1881     }
1882
1883     if (!c->hyscale_fast) {
1884         c->hScale(dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
1885     } else { // fast bilinear upscale / crap downscale
1886         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1887     }
1888
1889     if (convertRange)
1890         convertRange(dst, dstWidth);
1891 }
1892
1893 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1894                            int dstWidth, const uint8_t *src1,
1895                            const uint8_t *src2, int srcW, int xInc)
1896 {
1897     int i;
1898     unsigned int xpos=0;
1899     for (i=0;i<dstWidth;i++) {
1900         register unsigned int xx=xpos>>16;
1901         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1902         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1903         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1904         xpos+=xInc;
1905     }
1906 }
1907
1908 static av_always_inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1909                                      const uint8_t *src1, const uint8_t *src2,
1910                                      int srcW, int xInc, const int16_t *hChrFilter,
1911                                      const int16_t *hChrFilterPos, int hChrFilterSize,
1912                                      uint8_t *formatConvBuffer, uint32_t *pal)
1913 {
1914     if (c->chrToYV12) {
1915         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1916         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1917         src1= formatConvBuffer;
1918         src2= buf2;
1919     }
1920
1921     if (!c->hcscale_fast) {
1922         c->hScale(dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
1923         c->hScale(dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
1924     } else { // fast bilinear upscale / crap downscale
1925         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1926     }
1927
1928     if (c->chrConvertRange)
1929         c->chrConvertRange(dst1, dst2, dstWidth);
1930 }
1931
/**
 * Select the plain-C output writers that match c->dstFormat and store
 * them through the caller's function-pointer slots.
 *
 * Planar slots: NV12/NV21, 16-bit and 9/10-bit destinations assign only
 * *yuv2yuvX; every other destination assigns both *yuv2yuv1 and
 * *yuv2yuvX.
 * Packed slots: when SWS_FULL_CHR_H_INT is set in c->flags only
 * *yuv2packedX is assigned (four 32-bit RGB layouts, RGB24, BGR24);
 * otherwise *yuv2packed1, *yuv2packed2 and *yuv2packedX are assigned
 * together.
 * A slot that no branch matches is left untouched.
 *
 * @param c           context; dstFormat, flags and alpPixBuf are read
 * @param yuv2yuv1    out: planar writer slot
 * @param yuv2yuvX    out: planar writer slot
 * @param yuv2packed1 out: packed writer slot
 * @param yuv2packed2 out: packed writer slot
 * @param yuv2packedX out: packed writer slot
 */
1932 static av_always_inline void
1933 find_c_packed_planar_out_funcs(SwsContext *c,
1934                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
1935                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
1936                                yuv2packedX_fn *yuv2packedX)
1937 {
1938     enum PixelFormat dstFormat = c->dstFormat;
1939
    /* --- planar writers --- */
1940     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1941         *yuv2yuvX     = yuv2nv12X_c;
1942     } else if (is16BPS(dstFormat)) {
1943         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
1944     } else if (is9_OR_10BPS(dstFormat)) {
        /* depth_minus1 == 8 selects the 9-bit writers; anything else that
         * reaches this branch gets the 10-bit ones */
1945         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
1946             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
1947         } else {
1948             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
1949         }
1950     } else {
1951         *yuv2yuv1     = yuv2yuv1_c;
1952         *yuv2yuvX     = yuv2yuvX_c;
1953     }
    /* --- packed writers --- */
1954     if(c->flags & SWS_FULL_CHR_H_INT) {
        /* Only *yuv2packedX is assigned on this path. For the four 32-bit
         * layouts, CONFIG_SMALL builds always take the alpha-named writer;
         * other builds take it only when CONFIG_SWSCALE_ALPHA is enabled and
         * c->alpPixBuf is set, and the "x"-named writer in every other case. */
1955         switch (dstFormat) {
1956             case PIX_FMT_RGBA:
1957 #if CONFIG_SMALL
1958                 *yuv2packedX = yuv2rgba32_full_X_c;
1959 #else
1960 #if CONFIG_SWSCALE_ALPHA
1961                 if (c->alpPixBuf) {
1962                     *yuv2packedX = yuv2rgba32_full_X_c;
1963                 } else
1964 #endif /* CONFIG_SWSCALE_ALPHA */
1965                 {
1966                     *yuv2packedX = yuv2rgbx32_full_X_c;
1967                 }
1968 #endif /* !CONFIG_SMALL */
1969                 break;
1970             case PIX_FMT_ARGB:
1971 #if CONFIG_SMALL
1972                 *yuv2packedX = yuv2argb32_full_X_c;
1973 #else
1974 #if CONFIG_SWSCALE_ALPHA
1975                 if (c->alpPixBuf) {
1976                     *yuv2packedX = yuv2argb32_full_X_c;
1977                 } else
1978 #endif /* CONFIG_SWSCALE_ALPHA */
1979                 {
1980                     *yuv2packedX = yuv2xrgb32_full_X_c;
1981                 }
1982 #endif /* !CONFIG_SMALL */
1983                 break;
1984             case PIX_FMT_BGRA:
1985 #if CONFIG_SMALL
1986                 *yuv2packedX = yuv2bgra32_full_X_c;
1987 #else
1988 #if CONFIG_SWSCALE_ALPHA
1989                 if (c->alpPixBuf) {
1990                     *yuv2packedX = yuv2bgra32_full_X_c;
1991                 } else
1992 #endif /* CONFIG_SWSCALE_ALPHA */
1993                 {
1994                     *yuv2packedX = yuv2bgrx32_full_X_c;
1995                 }
1996 #endif /* !CONFIG_SMALL */
1997                 break;
1998             case PIX_FMT_ABGR:
1999 #if CONFIG_SMALL
2000                 *yuv2packedX = yuv2abgr32_full_X_c;
2001 #else
2002 #if CONFIG_SWSCALE_ALPHA
2003                 if (c->alpPixBuf) {
2004                     *yuv2packedX = yuv2abgr32_full_X_c;
2005                 } else
2006 #endif /* CONFIG_SWSCALE_ALPHA */
2007                 {
2008                     *yuv2packedX = yuv2xbgr32_full_X_c;
2009                 }
2010 #endif /* !CONFIG_SMALL */
2011                 break;
2012             case PIX_FMT_RGB24:
2013             *yuv2packedX = yuv2rgb24_full_X_c;
2014             break;
2015         case PIX_FMT_BGR24:
2016             *yuv2packedX = yuv2bgr24_full_X_c;
2017             break;
        /* no default: any other format keeps the caller's *yuv2packedX */
2018         }
2019     } else {
        /* Regular path: the 1-, 2- and X-variants are always assigned as a
         * triple. Formats without a case keep the caller's values. */
2020         switch (dstFormat) {
2021         case PIX_FMT_GRAY16BE:
2022             *yuv2packed1 = yuv2gray16BE_1_c;
2023             *yuv2packed2 = yuv2gray16BE_2_c;
2024             *yuv2packedX = yuv2gray16BE_X_c;
2025             break;
2026         case PIX_FMT_GRAY16LE:
2027             *yuv2packed1 = yuv2gray16LE_1_c;
2028             *yuv2packed2 = yuv2gray16LE_2_c;
2029             *yuv2packedX = yuv2gray16LE_X_c;
2030             break;
2031         case PIX_FMT_MONOWHITE:
2032             *yuv2packed1 = yuv2monowhite_1_c;
2033             *yuv2packed2 = yuv2monowhite_2_c;
2034             *yuv2packedX = yuv2monowhite_X_c;
2035             break;
2036         case PIX_FMT_MONOBLACK:
2037             *yuv2packed1 = yuv2monoblack_1_c;
2038             *yuv2packed2 = yuv2monoblack_2_c;
2039             *yuv2packedX = yuv2monoblack_X_c;
2040             break;
2041         case PIX_FMT_YUYV422:
2042             *yuv2packed1 = yuv2yuyv422_1_c;
2043             *yuv2packed2 = yuv2yuyv422_2_c;
2044             *yuv2packedX = yuv2yuyv422_X_c;
2045             break;
2046         case PIX_FMT_UYVY422:
2047             *yuv2packed1 = yuv2uyvy422_1_c;
2048             *yuv2packed2 = yuv2uyvy422_2_c;
2049             *yuv2packedX = yuv2uyvy422_X_c;
2050             break;
2051         case PIX_FMT_RGB48LE:
2052             //*yuv2packed1 = yuv2rgb48le_1_c;
2053             //*yuv2packed2 = yuv2rgb48le_2_c;
2054             //*yuv2packedX = yuv2rgb48le_X_c;
2055             //break;
            /* The LE-specific assignments are disabled, so this case falls
             * through to RGB48BE and receives the rgb48be writers.
             * NOTE(review): confirm that using the BE writers for LE output
             * is intended. */
2056         case PIX_FMT_RGB48BE:
2057             *yuv2packed1 = yuv2rgb48be_1_c;
2058             *yuv2packed2 = yuv2rgb48be_2_c;
2059             *yuv2packedX = yuv2rgb48be_X_c;
2060             break;
2061         case PIX_FMT_BGR48LE:
2062             //*yuv2packed1 = yuv2bgr48le_1_c;
2063             //*yuv2packed2 = yuv2bgr48le_2_c;
2064             //*yuv2packedX = yuv2bgr48le_X_c;
2065             //break;
            /* Same situation as RGB48LE: falls through to BGR48BE.
             * NOTE(review): confirm this is intended. */
2066         case PIX_FMT_BGR48BE:
2067             *yuv2packed1 = yuv2bgr48be_1_c;
2068             *yuv2packed2 = yuv2bgr48be_2_c;
2069             *yuv2packedX = yuv2bgr48be_X_c;
2070             break;
2071         case PIX_FMT_RGB32:
2072         case PIX_FMT_BGR32:
            /* CONFIG_SMALL builds use the yuv2rgb32_* writers; other builds
             * use yuv2rgba32_* when CONFIG_SWSCALE_ALPHA is enabled and
             * c->alpPixBuf is set, and yuv2rgbx32_* otherwise. */
2073 #if CONFIG_SMALL
2074             *yuv2packed1 = yuv2rgb32_1_c;
2075             *yuv2packed2 = yuv2rgb32_2_c;
2076             *yuv2packedX = yuv2rgb32_X_c;
2077 #else
2078 #if CONFIG_SWSCALE_ALPHA
2079                 if (c->alpPixBuf) {
2080                     *yuv2packed1 = yuv2rgba32_1_c;
2081                     *yuv2packed2 = yuv2rgba32_2_c;
2082                     *yuv2packedX = yuv2rgba32_X_c;
2083                 } else
2084 #endif /* CONFIG_SWSCALE_ALPHA */
2085                 {
2086                     *yuv2packed1 = yuv2rgbx32_1_c;
2087                     *yuv2packed2 = yuv2rgbx32_2_c;
2088                     *yuv2packedX = yuv2rgbx32_X_c;
2089                 }
2090 #endif /* !CONFIG_SMALL */
2091             break;
2092         case PIX_FMT_RGB32_1:
2093         case PIX_FMT_BGR32_1:
            /* Same selection scheme as RGB32/BGR32, using the *_1_* writers. */
2094 #if CONFIG_SMALL
2095                 *yuv2packed1 = yuv2rgb32_1_1_c;
2096                 *yuv2packed2 = yuv2rgb32_1_2_c;
2097                 *yuv2packedX = yuv2rgb32_1_X_c;
2098 #else
2099 #if CONFIG_SWSCALE_ALPHA
2100                 if (c->alpPixBuf) {
2101                     *yuv2packed1 = yuv2rgba32_1_1_c;
2102                     *yuv2packed2 = yuv2rgba32_1_2_c;
2103                     *yuv2packedX = yuv2rgba32_1_X_c;
2104                 } else
2105 #endif /* CONFIG_SWSCALE_ALPHA */
2106                 {
2107                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2108                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2109                     *yuv2packedX = yuv2rgbx32_1_X_c;
2110                 }
2111 #endif /* !CONFIG_SMALL */
2112                 break;
2113         case PIX_FMT_RGB24:
2114             *yuv2packed1 = yuv2rgb24_1_c;
2115             *yuv2packed2 = yuv2rgb24_2_c;
2116             *yuv2packedX = yuv2rgb24_X_c;
2117             break;
2118         case PIX_FMT_BGR24:
2119             *yuv2packed1 = yuv2bgr24_1_c;
2120             *yuv2packed2 = yuv2bgr24_2_c;
2121             *yuv2packedX = yuv2bgr24_X_c;
2122             break;
        /* From here on each group of RGB/BGR and LE/BE variants shares one
         * writer triple. */
2123         case PIX_FMT_RGB565LE:
2124         case PIX_FMT_RGB565BE:
2125         case PIX_FMT_BGR565LE:
2126         case PIX_FMT_BGR565BE:
2127             *yuv2packed1 = yuv2rgb16_1_c;
2128             *yuv2packed2 = yuv2rgb16_2_c;
2129             *yuv2packedX = yuv2rgb16_X_c;
2130             break;
2131         case PIX_FMT_RGB555LE:
2132         case PIX_FMT_RGB555BE:
2133         case PIX_FMT_BGR555LE:
2134         case PIX_FMT_BGR555BE:
2135             *yuv2packed1 = yuv2rgb15_1_c;
2136             *yuv2packed2 = yuv2rgb15_2_c;
2137             *yuv2packedX = yuv2rgb15_X_c;
2138             break;
2139         case PIX_FMT_RGB444LE:
2140         case PIX_FMT_RGB444BE:
2141         case PIX_FMT_BGR444LE:
2142         case PIX_FMT_BGR444BE:
2143             *yuv2packed1 = yuv2rgb12_1_c;
2144             *yuv2packed2 = yuv2rgb12_2_c;
2145             *yuv2packedX = yuv2rgb12_X_c;
2146             break;
2147         case PIX_FMT_RGB8:
2148         case PIX_FMT_BGR8:
2149             *yuv2packed1 = yuv2rgb8_1_c;
2150             *yuv2packed2 = yuv2rgb8_2_c;
2151             *yuv2packedX = yuv2rgb8_X_c;
2152             break;
2153         case PIX_FMT_RGB4:
2154         case PIX_FMT_BGR4:
2155             *yuv2packed1 = yuv2rgb4_1_c;
2156             *yuv2packed2 = yuv2rgb4_2_c;
2157             *yuv2packedX = yuv2rgb4_X_c;
2158             break;
2159         case PIX_FMT_RGB4_BYTE:
2160         case PIX_FMT_BGR4_BYTE:
2161             *yuv2packed1 = yuv2rgb4b_1_c;
2162             *yuv2packed2 = yuv2rgb4b_2_c;
2163             *yuv2packedX = yuv2rgb4b_X_c;
2164             break;
2165         }
2166     }
2167 }
2168
/* Set to 1 to log the line-buffer bookkeeping through av_log(). */
#define DEBUG_SWSCALE_BUFFERS 0
/* Log with AV_LOG_DEBUG when DEBUG_SWSCALE_BUFFERS is non-zero. The macro
 * expects a variable named `c` to be in scope at the point of use.
 * It is wrapped in do { } while (0) so that it always expands to exactly
 * one statement: the bare `if` form could silently capture an `else`
 * that follows an unbraced use of the macro. */
#define DEBUG_BUFFERS(...) \
    do { \
        if (DEBUG_SWSCALE_BUFFERS) \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__); \
    } while (0)
2171
2172 static int swScale(SwsContext *c, const uint8_t* src[],
2173                    int srcStride[], int srcSliceY,
2174                    int srcSliceH, uint8_t* dst[], int dstStride[])
2175 {
2176     /* load a few things into local vars to make the code more readable? and faster */
2177     const int srcW= c->srcW;
2178     const int dstW= c->dstW;
2179     const int dstH= c->dstH;
2180     const int chrDstW= c->chrDstW;
2181     const int chrSrcW= c->chrSrcW;
2182     const int lumXInc= c->lumXInc;
2183     const int chrXInc= c->chrXInc;
2184     const enum PixelFormat dstFormat= c->dstFormat;
2185     const int flags= c->flags;
2186     int16_t *vLumFilterPos= c->vLumFilterPos;
2187     int16_t *vChrFilterPos= c->vChrFilterPos;
2188     int16_t *hLumFilterPos= c->hLumFilterPos;
2189     int16_t *hChrFilterPos= c->hChrFilterPos;
2190     int16_t *vLumFilter= c->vLumFilter;
2191     int16_t *vChrFilter= c->vChrFilter;
2192     int16_t *hLumFilter= c->hLumFilter;
2193     int16_t *hChrFilter= c->hChrFilter;
2194     int32_t *lumMmxFilter= c->lumMmxFilter;
2195     int32_t *chrMmxFilter= c->chrMmxFilter;
2196     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2197     const int vLumFilterSize= c->vLumFilterSize;
2198     const int vChrFilterSize= c->vChrFilterSize;
2199     const int hLumFilterSize= c->hLumFilterSize;
2200     const int hChrFilterSize= c->hChrFilterSize;
2201     int16_t **lumPixBuf= c->lumPixBuf;
2202     int16_t **chrUPixBuf= c->chrUPixBuf;
2203     int16_t **chrVPixBuf= c->chrVPixBuf;
2204     int16_t **alpPixBuf= c->alpPixBuf;
2205     const int vLumBufSize= c->vLumBufSize;
2206     const int vChrBufSize= c->vChrBufSize;
2207     uint8_t *formatConvBuffer= c->formatConvBuffer;
2208     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2209     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2210     int lastDstY;
2211     uint32_t *pal=c->pal_yuv;
2212     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2213     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2214     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2215     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2216     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2217
2218     /* vars which will change and which we need to store back in the context */
2219     int dstY= c->dstY;
2220     int lumBufIndex= c->lumBufIndex;
2221     int chrBufIndex= c->chrBufIndex;
2222     int lastInLumBuf= c->lastInLumBuf;
2223     int lastInChrBuf= c->lastInChrBuf;
2224
2225     if (isPacked(c->srcFormat)) {
2226         src[0]=
2227         src[1]=
2228         src[2]=
2229         src[3]= src[0];
2230         srcStride[0]=
2231         srcStride[1]=
2232         srcStride[2]=
2233         srcStride[3]= srcStride[0];
2234     }
2235     srcStride[1]<<= c->vChrDrop;
2236     srcStride[2]<<= c->vChrDrop;
2237
2238     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2239                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2240                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2241     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2242                    srcSliceY,    srcSliceH,    dstY,    dstH);
2243     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2244                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2245
2246     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2247         static int warnedAlready=0; //FIXME move this into the context perhaps
2248         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2249             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2250                    "         ->cannot do aligned memory accesses anymore\n");
2251             warnedAlready=1;
2252         }
2253     }
2254
2255     /* Note the user might start scaling the picture in the middle so this
2256        will not get executed. This is not really intended but works
2257        currently, so people might do it. */
2258     if (srcSliceY ==0) {
2259         lumBufIndex=-1;
2260         chrBufIndex=-1;
2261         dstY=0;
2262         lastInLumBuf= -1;
2263         lastInChrBuf= -1;
2264     }
2265
2266     lastDstY= dstY;
2267
2268     for (;dstY < dstH; dstY++) {
2269         const int chrDstY= dstY>>c->chrDstVSubSample;
2270         uint8_t *dest[4] = {
2271             dst[0] + dstStride[0] * dstY,
2272             dst[1] + dstStride[1] * chrDstY,
2273             dst[2] + dstStride[2] * chrDstY,
2274             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2275         };
2276
2277         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2278         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2279         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2280         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2281         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2282         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2283         int enough_lines;
2284
2285         //handle holes (FAST_BILINEAR & weird filters)
2286         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2287         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2288         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2289         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2290
2291         DEBUG_BUFFERS("dstY: %d\n", dstY);
2292         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2293                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2294         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2295                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2296
2297         // Do we have enough lines in this slice to output the dstY line
2298         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2299
2300         if (!enough_lines) {
2301             lastLumSrcY = srcSliceY + srcSliceH - 1;
2302             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2303             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2304                                             lastLumSrcY, lastChrSrcY);
2305         }
2306
2307         //Do horizontal scaling
2308         while(lastInLumBuf < lastLumSrcY) {
2309             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2310             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2311             lumBufIndex++;
2312             assert(lumBufIndex < 2*vLumBufSize);
2313             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2314             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2315             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2316                     hLumFilter, hLumFilterPos, hLumFilterSize,
2317                     formatConvBuffer,
2318                     pal, 0);
2319             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2320                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2321                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2322                         formatConvBuffer,
2323                         pal, 1);
2324             lastInLumBuf++;
2325             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2326                                lumBufIndex,    lastInLumBuf);
2327         }
2328         while(lastInChrBuf < lastChrSrcY) {
2329             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2330             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2331             chrBufIndex++;
2332             assert(chrBufIndex < 2*vChrBufSize);
2333             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2334             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2335             //FIXME replace parameters through context struct (some at least)
2336
2337             if (c->needs_hcscale)
2338                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2339                           chrDstW, src1, src2, chrSrcW, chrXInc,
2340                           hChrFilter, hChrFilterPos, hChrFilterSize,
2341                           formatConvBuffer, pal);
2342             lastInChrBuf++;
2343             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2344                                chrBufIndex,    lastInChrBuf);
2345         }
2346         //wrap buf index around to stay inside the ring buffer
2347         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2348         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2349         if (!enough_lines)
2350             break; //we can't output a dstY line so let's try with the next slice
2351
2352 #if HAVE_MMX
2353         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2354 #endif
2355         if (dstY >= dstH-2) {
2356             // hmm looks like we can't use MMX here without overwriting this array's tail
2357             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2358                                            &yuv2packed1, &yuv2packed2,
2359                                            &yuv2packedX);
2360         }
2361
2362         {
2363             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2364             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2365             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2366             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2367             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2368                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2369                 if ((dstY&chrSkipMask) || isGray(dstFormat))
2370                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
2371                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2372                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2373                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
2374                              dest, dstW, chrDstW);
2375                 } else { //General YV12
2376                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
2377                              lumSrcPtr, vLumFilterSize,
2378                              vChrFilter + chrDstY * vChrFilterSize,
2379                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2380                              alpSrcPtr, dest, dstW, chrDstW);
2381                 }
2382             } else {
2383                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2384                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2385                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2386                     int chrAlpha = vChrFilter[2 * dstY + 1];
2387                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2388                                 alpPixBuf ? *alpSrcPtr : NULL,
2389                                 dest[0], dstW, chrAlpha, dstY);
2390                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2391                     int lumAlpha = vLumFilter[2 * dstY + 1];
2392                     int chrAlpha = vChrFilter[2 * dstY + 1];
2393                     lumMmxFilter[2] =
2394                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2395                     chrMmxFilter[2] =
2396                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2397                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2398                                 alpPixBuf ? alpSrcPtr : NULL,
2399                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2400                 } else { //general RGB
2401                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2402                                 lumSrcPtr, vLumFilterSize,
2403                                 vChrFilter + dstY * vChrFilterSize,
2404                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2405                                 alpSrcPtr, dest[0], dstW, dstY);
2406                 }
2407             }
2408         }
2409     }
2410
2411     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2412         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2413
2414 #if HAVE_MMX2
2415     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2416         __asm__ volatile("sfence":::"memory");
2417 #endif
2418     emms_c();
2419
2420     /* store changed local vars back in the context */
2421     c->dstY= dstY;
2422     c->lumBufIndex= lumBufIndex;
2423     c->chrBufIndex= chrBufIndex;
2424     c->lastInLumBuf= lastInLumBuf;
2425     c->lastInChrBuf= lastInChrBuf;
2426
2427     return dstY - lastDstY;
2428 }
2429
2430 static av_cold void sws_init_swScale_c(SwsContext *c)
2431 {
2432     enum PixelFormat srcFormat = c->srcFormat;
2433
2434     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2435                                    &c->yuv2packed1, &c->yuv2packed2,
2436                                    &c->yuv2packedX);
2437
2438     c->hScale       = hScale_c;
2439
2440     if (c->flags & SWS_FAST_BILINEAR) {
2441         c->hyscale_fast = hyscale_fast_c;
2442         c->hcscale_fast = hcscale_fast_c;
2443     }
2444
2445     c->chrToYV12 = NULL;
2446     switch(srcFormat) {
2447         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2448         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2449         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2450         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2451         case PIX_FMT_RGB8     :
2452         case PIX_FMT_BGR8     :
2453         case PIX_FMT_PAL8     :
2454         case PIX_FMT_BGR4_BYTE:
2455         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2456         case PIX_FMT_YUV444P9BE:
2457         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
2458         case PIX_FMT_YUV444P9LE:
2459         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
2460         case PIX_FMT_YUV444P10BE:
2461         case PIX_FMT_YUV422P10BE:
2462         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
2463         case PIX_FMT_YUV422P10LE:
2464         case PIX_FMT_YUV444P10LE:
2465         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
2466         case PIX_FMT_YUV420P16BE:
2467         case PIX_FMT_YUV422P16BE:
2468         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
2469         case PIX_FMT_YUV420P16LE:
2470         case PIX_FMT_YUV422P16LE:
2471         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
2472     }
2473     if (c->chrSrcHSubSample) {
2474         switch(srcFormat) {
2475         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2476         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2477         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2478         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2479         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2480         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2481         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2482         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2483         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2484         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2485         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2486         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2487         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2488         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2489         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2490         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2491         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2492         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2493         }
2494     } else {
2495         switch(srcFormat) {
2496         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2497         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2498         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2499         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2500         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2501         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2502         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2503         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2504         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2505         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2506         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2507         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2508         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2509         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2510         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2511         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2512         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2513         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2514         }
2515     }
2516
2517     c->lumToYV12 = NULL;
2518     c->alpToYV12 = NULL;
2519     switch (srcFormat) {
2520     case PIX_FMT_YUV444P9BE:
2521     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
2522     case PIX_FMT_YUV444P9LE:
2523     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
2524     case PIX_FMT_YUV444P10BE:
2525     case PIX_FMT_YUV422P10BE:
2526     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2527     case PIX_FMT_YUV444P10LE:
2528     case PIX_FMT_YUV422P10LE:
2529     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2530     case PIX_FMT_YUYV422  :
2531     case PIX_FMT_YUV420P16BE:
2532     case PIX_FMT_YUV422P16BE:
2533     case PIX_FMT_YUV444P16BE:
2534     case PIX_FMT_Y400A    :
2535     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
2536     case PIX_FMT_UYVY422  :
2537     case PIX_FMT_YUV420P16LE:
2538     case PIX_FMT_YUV422P16LE:
2539     case PIX_FMT_YUV444P16LE:
2540     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c;    break;
2541     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2542     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2543     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2544     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2545     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2546     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2547     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2548     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2549     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2550     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2551     case PIX_FMT_RGB8     :
2552     case PIX_FMT_BGR8     :
2553     case PIX_FMT_PAL8     :
2554     case PIX_FMT_BGR4_BYTE:
2555     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2556     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2557     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2558     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2559     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2560     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2561     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2562     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2563     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2564     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2565     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2566     }
2567     if (c->alpPixBuf) {
2568         switch (srcFormat) {
2569         case PIX_FMT_BGRA:
2570         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2571         case PIX_FMT_ABGR:
2572         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2573         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2574         }
2575     }
2576
2577     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2578         if (c->srcRange) {
2579             c->lumConvertRange = lumRangeFromJpeg_c;
2580             c->chrConvertRange = chrRangeFromJpeg_c;
2581         } else {
2582             c->lumConvertRange = lumRangeToJpeg_c;
2583             c->chrConvertRange = chrRangeToJpeg_c;
2584         }
2585     }
2586
2587     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2588           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2589         c->needs_hcscale = 1;
2590 }
2591
2592 SwsFunc ff_getSwsFunc(SwsContext *c)
2593 {
2594     sws_init_swScale_c(c);
2595
2596     if (HAVE_MMX)
2597         ff_sws_init_swScale_mmx(c);
2598     if (HAVE_ALTIVEC)
2599         ff_sws_init_swScale_altivec(c);
2600
2601     return swScale;
2602 }