git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
/* NOTE(review): DITHER1XBPP is not referenced anywhere in the visible part of
 * this file; presumably it switches on dithering for low-bit-depth output
 * elsewhere -- confirm before removing. */
#define DITHER1XBPP

/*
 * Fixed-point RGB -> YUV weights with RGB2YUV_SHIFT fractional bits.
 *
 * Naming is <input component><output component>: BY is the weight of blue
 * in Y, GU the weight of green in U, and so on. The luma weights
 * (0.114 / 0.587 / 0.299) are scaled by 219/255 and the chroma weights by
 * 224/255. Adding 0.5 before the (int) cast rounds to nearest; negative
 * weights are rounded as positive magnitudes and negated afterwards.
 */
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 static const double rgb2yuv_table[8][9]={
  84     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  85     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  86     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  87     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  88     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  89     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  90     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  91     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
  92 };
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
/* File-local 8-byte-aligned dither matrix with values 0..3: a 2x2 pattern
 * whose rows are repeated four times to fill 8 columns.
 * NOTE(review): users are outside the visible part of the file; presumably
 * indexed as [y & 1][x & 7] -- confirm. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
{  1,   3,   1,   3,   1,   3,   1,   3, },
{  2,   0,   2,   0,   2,   0,   2,   0, },
};
 113
/* File-local 8-byte-aligned dither matrix with values {0, 2, 4, 6}: a 2x2
 * pattern whose rows are repeated four times to fill 8 columns.
 * NOTE(review): users are outside the visible part of the file -- confirm
 * the indexing convention there. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
{  6,   2,   6,   2,   6,   2,   6,   2, },
{  0,   4,   0,   4,   0,   4,   0,   4, },
};
 118
/* 8-byte-aligned dither matrix with values 0..15: a 4x4 pattern whose rows
 * are repeated twice to fill 8 columns. Not declared static, so it has
 * external linkage. */
DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
{  8,   4,  11,   7,   8,   4,  11,   7, },
{  2,  14,   1,  13,   2,  14,   1,  13, },
{ 10,   6,   9,   5,  10,   6,   9,   5, },
{  0,  12,   3,  15,   0,  12,   3,  15, },
};
 125
/* 8-byte-aligned 8x8 dither matrix with values 0..31. Not declared static,
 * so it has external linkage. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
{ 17,   9,  23,  15,  16,   8,  22,  14, },
{  5,  29,   3,  27,   4,  28,   2,  26, },
{ 21,  13,  19,  11,  20,  12,  18,  10, },
{  0,  24,   6,  30,   1,  25,   7,  31, },
{ 16,   8,  22,  14,  17,   9,  23,  15, },
{  4,  28,   2,  26,   5,  29,   3,  27, },
{ 20,  12,  18,  10,  21,  13,  19,  11, },
{  1,  25,   7,  31,   0,  24,   6,  30, },
};
 136
/* 8-byte-aligned 8x8 dither matrix with values 0..72. Not declared static,
 * so it has external linkage. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
{  0,  55,  14,  68,   3,  58,  17,  72, },
{ 37,  18,  50,  32,  40,  22,  54,  35, },
{  9,  64,   5,  59,  13,  67,   8,  63, },
{ 46,  27,  41,  23,  49,  31,  44,  26, },
{  2,  57,  16,  71,   1,  56,  15,  70, },
{ 39,  21,  52,  34,  38,  19,  51,  33, },
{ 11,  66,   7,  62,  10,  65,   6,  60, },
{ 48,  30,  43,  25,  47,  29,  42,  24, },
};
 147
/*
 * 8x8 dither matrix used by the monochrome output templates in this file,
 * which pick a row with [y & 7] and add the per-column entry to the luma
 * value before the table lookup.
 *
 * Only the first branch below is compiled (it is guarded by "#if 1"); the
 * gamma-corrected alternatives are kept for reference and are dead code
 * unless the first condition is changed by hand.
 */
#if 1
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{117,  62, 158, 103, 113,  58, 155, 100, },
{ 34, 199,  21, 186,  31, 196,  17, 182, },
{144,  89, 131,  76, 141,  86, 127,  72, },
{  0, 165,  41, 206,  10, 175,  52, 217, },
{110,  55, 151,  96, 120,  65, 162, 107, },
{ 28, 193,  14, 179,  38, 203,  24, 189, },
{138,  83, 124,  69, 148,  93, 134,  79, },
{  7, 172,  48, 213,   3, 168,  45, 210, },
};
#elif 1
// tries to correct a gamma of 1.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 143,  18, 200,   2, 156,  25, 215, },
{ 78,  28, 125,  64,  89,  36, 138,  74, },
{ 10, 180,   3, 161,  16, 195,   8, 175, },
{109,  51,  93,  38, 121,  60, 105,  47, },
{  1, 152,  23, 210,   0, 147,  20, 205, },
{ 85,  33, 134,  71,  81,  30, 130,  67, },
{ 14, 190,   6, 171,  12, 185,   5, 166, },
{117,  57, 101,  44, 113,  54,  97,  41, },
};
#elif 1
// tries to correct a gamma of 2.0
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 124,   8, 193,   0, 140,  12, 213, },
{ 55,  14, 104,  42,  66,  19, 119,  52, },
{  3, 168,   1, 145,   6, 187,   3, 162, },
{ 86,  31,  70,  21,  99,  39,  82,  28, },
{  0, 134,  11, 206,   0, 129,   9, 200, },
{ 62,  17, 114,  48,  58,  16, 109,  45, },
{  5, 181,   2, 157,   4, 175,   1, 151, },
{ 95,  36,  78,  26,  90,  34,  74,  24, },
};
#else
// tries to correct a gamma of 2.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 107,   3, 187,   0, 125,   6, 212, },
{ 39,   7,  86,  28,  49,  11, 102,  36, },
{  1, 158,   0, 131,   3, 180,   1, 151, },
{ 68,  19,  52,  12,  81,  25,  64,  17, },
{  0, 119,   5, 203,   0, 113,   4, 195, },
{ 45,   9,  96,  33,  42,   8,  91,  30, },
{  2, 172,   1, 144,   2, 165,   0, 137, },
{ 77,  23,  60,  15,  72,  21,  56,  14, },
};
#endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 201                       int chrFilterSize, const int32_t **alpSrc,
 202                       uint16_t *dest[4], int dstW, int chrDstW,
 203                       int big_endian, int output_bits)
 204 {
 205     //FIXME Optimize (just quickly written not optimized..)
 206     int i;
 207     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 208              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 209     int shift = 15 + 16 - output_bits;
 210
 211 #define output_pixel(pos, val) \
 212     if (big_endian) { \
 213         if (output_bits == 16) { \
 214             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 215         } else { \
 216             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 217         } \
 218     } else { \
 219         if (output_bits == 16) { \
 220             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 221         } else { \
 222             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 223         } \
 224     }
 225     for (i = 0; i < dstW; i++) {
 226         int val = 1 << (30-output_bits);
 227         int j;
 228
 229         for (j = 0; j < lumFilterSize; j++)
 230             val += lumSrc[j][i] * lumFilter[j];
 231
 232         output_pixel(&yDest[i], val);
 233     }
 234
 235     if (uDest) {
 236         for (i = 0; i < chrDstW; i++) {
 237             int u = 1 << (30-output_bits);
 238             int v = 1 << (30-output_bits);
 239             int j;
 240
 241             for (j = 0; j < chrFilterSize; j++) {
 242                 u += chrUSrc[j][i] * chrFilter[j];
 243                 v += chrVSrc[j][i] * chrFilter[j];
 244             }
 245
 246             output_pixel(&uDest[i], u);
 247             output_pixel(&vDest[i], v);
 248         }
 249     }
 250
 251     if (CONFIG_SWSCALE_ALPHA && aDest) {
 252         for (i = 0; i < dstW; i++) {
 253             int val = 1 << (30-output_bits);
 254             int j;
 255
 256             for (j = 0; j < lumFilterSize; j++)
 257                 val += alpSrc[j][i] * lumFilter[j];
 258
 259             output_pixel(&aDest[i], val);
 260         }
 261     }
 262 #undef output_pixel
 263 }
 264
 265 #define yuv2NBPS(bits, BE_LE, is_be) \
 266 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 267                               const int16_t **_lumSrc, int lumFilterSize, \
 268                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 269                               const int16_t **_chrVSrc, \
 270                               int chrFilterSize, const int16_t **_alpSrc, \
 271                               uint8_t *_dest[4], int dstW, int chrDstW) \
 272 { \
 273     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 274                   **chrUSrc = (const int32_t **) _chrUSrc, \
 275                   **chrVSrc = (const int32_t **) _chrVSrc, \
 276                   **alpSrc  = (const int32_t **) _alpSrc; \
 277     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 278                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 279                           alpSrc, (uint16_t **) _dest, \
 280                           dstW, chrDstW, is_be, bits); \
 281 }
 282 yuv2NBPS( 9, BE, 1);
 283 yuv2NBPS( 9, LE, 0);
 284 yuv2NBPS(10, BE, 1);
 285 yuv2NBPS(10, LE, 0);
 286 yuv2NBPS(16, BE, 1);
 287 yuv2NBPS(16, LE, 0);
 288
 289 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 290                        const int16_t **lumSrc, int lumFilterSize,
 291                        const int16_t *chrFilter, const int16_t **chrUSrc,
 292                        const int16_t **chrVSrc,
 293                        int chrFilterSize, const int16_t **alpSrc,
 294                        uint8_t *dest[4], int dstW, int chrDstW)
 295 {
 296     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 297             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 298     int i;
 299
 300     //FIXME Optimize (just quickly written not optimized..)
 301     for (i=0; i<dstW; i++) {
 302         int val=1<<18;
 303         int j;
 304         for (j=0; j<lumFilterSize; j++)
 305             val += lumSrc[j][i] * lumFilter[j];
 306
 307         yDest[i]= av_clip_uint8(val>>19);
 308     }
 309
 310     if (uDest)
 311         for (i=0; i<chrDstW; i++) {
 312             int u=1<<18;
 313             int v=1<<18;
 314             int j;
 315             for (j=0; j<chrFilterSize; j++) {
 316                 u += chrUSrc[j][i] * chrFilter[j];
 317                 v += chrVSrc[j][i] * chrFilter[j];
 318             }
 319
 320             uDest[i]= av_clip_uint8(u>>19);
 321             vDest[i]= av_clip_uint8(v>>19);
 322         }
 323
 324     if (CONFIG_SWSCALE_ALPHA && aDest)
 325         for (i=0; i<dstW; i++) {
 326             int val=1<<18;
 327             int j;
 328             for (j=0; j<lumFilterSize; j++)
 329                 val += alpSrc[j][i] * lumFilter[j];
 330
 331             aDest[i]= av_clip_uint8(val>>19);
 332         }
 333 }
 334
 335 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 336                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 337                        const int16_t *alpSrc,
 338                        uint8_t *dest[4], int dstW, int chrDstW)
 339 {
 340     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 341             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 342     int i;
 343
 344     for (i=0; i<dstW; i++) {
 345         int val= (lumSrc[i]+64)>>7;
 346         yDest[i]= av_clip_uint8(val);
 347     }
 348
 349     if (uDest)
 350         for (i=0; i<chrDstW; i++) {
 351             int u=(chrUSrc[i]+64)>>7;
 352             int v=(chrVSrc[i]+64)>>7;
 353             uDest[i]= av_clip_uint8(u);
 354             vDest[i]= av_clip_uint8(v);
 355         }
 356
 357     if (CONFIG_SWSCALE_ALPHA && aDest)
 358         for (i=0; i<dstW; i++) {
 359             int val= (alpSrc[i]+64)>>7;
 360             aDest[i]= av_clip_uint8(val);
 361         }
 362 }
 363
 364 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 365                         const int16_t **lumSrc, int lumFilterSize,
 366                         const int16_t *chrFilter, const int16_t **chrUSrc,
 367                         const int16_t **chrVSrc, int chrFilterSize,
 368                         const int16_t **alpSrc, uint8_t *dest[4],
 369                         int dstW, int chrDstW)
 370 {
 371     uint8_t *yDest = dest[0], *uDest = dest[1];
 372     enum PixelFormat dstFormat = c->dstFormat;
 373
 374     //FIXME Optimize (just quickly written not optimized..)
 375     int i;
 376     for (i=0; i<dstW; i++) {
 377         int val=1<<18;
 378         int j;
 379         for (j=0; j<lumFilterSize; j++)
 380             val += lumSrc[j][i] * lumFilter[j];
 381
 382         yDest[i]= av_clip_uint8(val>>19);
 383     }
 384
 385     if (!uDest)
 386         return;
 387
 388     if (dstFormat == PIX_FMT_NV12)
 389         for (i=0; i<chrDstW; i++) {
 390             int u=1<<18;
 391             int v=1<<18;
 392             int j;
 393             for (j=0; j<chrFilterSize; j++) {
 394                 u += chrUSrc[j][i] * chrFilter[j];
 395                 v += chrVSrc[j][i] * chrFilter[j];
 396             }
 397
 398             uDest[2*i]= av_clip_uint8(u>>19);
 399             uDest[2*i+1]= av_clip_uint8(v>>19);
 400         }
 401     else
 402         for (i=0; i<chrDstW; i++) {
 403             int u=1<<18;
 404             int v=1<<18;
 405             int j;
 406             for (j=0; j<chrFilterSize; j++) {
 407                 u += chrUSrc[j][i] * chrFilter[j];
 408                 v += chrVSrc[j][i] * chrFilter[j];
 409             }
 410
 411             uDest[2*i]= av_clip_uint8(v>>19);
 412             uDest[2*i+1]= av_clip_uint8(u>>19);
 413         }
 414 }
 415
 416 #define output_pixel(pos, val) \
 417         if (target == PIX_FMT_GRAY16BE) { \
 418             AV_WB16(pos, val); \
 419         } else { \
 420             AV_WL16(pos, val); \
 421         }
 422
 423 static av_always_inline void
 424 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 425                         const int32_t **lumSrc, int lumFilterSize,
 426                         const int16_t *chrFilter, const int32_t **chrUSrc,
 427                         const int32_t **chrVSrc, int chrFilterSize,
 428                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 429                         int y, enum PixelFormat target)
 430 {
 431     int i;
 432
 433     for (i = 0; i < (dstW >> 1); i++) {
 434         int j;
 435         int Y1 = 1 << 14;
 436         int Y2 = 1 << 14;
 437
 438         for (j = 0; j < lumFilterSize; j++) {
 439             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 440             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 441         }
 442         Y1 >>= 15;
 443         Y2 >>= 15;
 444         if ((Y1 | Y2) & 0x10000) {
 445             Y1 = av_clip_uint16(Y1);
 446             Y2 = av_clip_uint16(Y2);
 447         }
 448         output_pixel(&dest[i * 2 + 0], Y1);
 449         output_pixel(&dest[i * 2 + 1], Y2);
 450     }
 451 }
 452
 453 static av_always_inline void
 454 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 455                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 456                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 457                         int yalpha, int uvalpha, int y,
 458                         enum PixelFormat target)
 459 {
 460     int  yalpha1 = 4095 - yalpha;
 461     int i;
 462     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 463
 464     for (i = 0; i < (dstW >> 1); i++) {
 465         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 466         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 467
 468         output_pixel(&dest[i * 2 + 0], Y1);
 469         output_pixel(&dest[i * 2 + 1], Y2);
 470     }
 471 }
 472
 473 static av_always_inline void
 474 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 475                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 476                         const int32_t *abuf0, uint16_t *dest, int dstW,
 477                         int uvalpha, int y, enum PixelFormat target)
 478 {
 479     int i;
 480
 481     for (i = 0; i < (dstW >> 1); i++) {
 482         int Y1 = buf0[i * 2    ] << 1;
 483         int Y2 = buf0[i * 2 + 1] << 1;
 484
 485         output_pixel(&dest[i * 2 + 0], Y1);
 486         output_pixel(&dest[i * 2 + 1], Y2);
 487     }
 488 }
 489
 490 #undef output_pixel
 491
/*
 * Generate the three output entry points name##ext##_X_c, name##ext##_2_c
 * and name##ext##_1_c for a packed format with 16 bits per component.
 *
 * Each wrapper takes int16_t source pointers and a uint8_t destination,
 * casts them to const int32_t source pointers and a uint16_t destination,
 * and forwards to name##base##_X_c_template / _2_c_template / _1_c_template
 * with the pixel format fmt passed as the trailing `target` argument.
 */
#define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                        const int16_t **_lumSrc, int lumFilterSize, \
                        const int16_t *chrFilter, const int16_t **_chrUSrc, \
                        const int16_t **_chrVSrc, int chrFilterSize, \
                        const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
                        int y) \
{ \
    const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
                  **chrUSrc = (const int32_t **) _chrUSrc, \
                  **chrVSrc = (const int32_t **) _chrVSrc, \
                  **alpSrc  = (const int32_t **) _alpSrc; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                          alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
                        const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
                        int yalpha, int uvalpha, int y) \
{ \
    const int32_t **buf  = (const int32_t **) _buf, \
                  **ubuf = (const int32_t **) _ubuf, \
                  **vbuf = (const int32_t **) _vbuf, \
                  **abuf = (const int32_t **) _abuf; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                          dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
                        const int16_t *_abuf0, uint8_t *_dest, int dstW, \
                        int uvalpha, int y) \
{ \
    const int32_t *buf0  = (const int32_t *)  _buf0, \
                 **ubuf  = (const int32_t **) _ubuf, \
                 **vbuf  = (const int32_t **) _vbuf, \
                  *abuf0 = (const int32_t *)  _abuf0; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt); \
}

/* Instantiates yuv2gray16LE_{X,2,1}_c and yuv2gray16BE_{X,2,1}_c on top of
 * the yuv2gray16_*_c_template functions (base is empty). */
YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 540
 541 #define output_pixel(pos, acc) \
 542     if (target == PIX_FMT_MONOBLACK) { \
 543         pos = acc; \
 544     } else { \
 545         pos = ~acc; \
 546     }
 547
 548 static av_always_inline void
 549 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 550                       const int16_t **lumSrc, int lumFilterSize,
 551                       const int16_t *chrFilter, const int16_t **chrUSrc,
 552                       const int16_t **chrVSrc, int chrFilterSize,
 553                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 554                       int y, enum PixelFormat target)
 555 {
 556     const uint8_t * const d128=dither_8x8_220[y&7];
 557     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 558     int i;
 559     int acc = 0;
 560
 561     for (i = 0; i < dstW - 1; i += 2) {
 562         int j;
 563         int Y1 = 1 << 18;
 564         int Y2 = 1 << 18;
 565
 566         for (j = 0; j < lumFilterSize; j++) {
 567             Y1 += lumSrc[j][i]   * lumFilter[j];
 568             Y2 += lumSrc[j][i+1] * lumFilter[j];
 569         }
 570         Y1 >>= 19;
 571         Y2 >>= 19;
 572         if ((Y1 | Y2) & 0x100) {
 573             Y1 = av_clip_uint8(Y1);
 574             Y2 = av_clip_uint8(Y2);
 575         }
 576         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 577         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 578         if ((i & 7) == 6) {
 579             output_pixel(*dest++, acc);
 580         }
 581     }
 582 }
 583
 584 static av_always_inline void
 585 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 586                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 587                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 588                       int yalpha, int uvalpha, int y,
 589                       enum PixelFormat target)
 590 {
 591     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 592     const uint8_t * const d128 = dither_8x8_220[y & 7];
 593     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 594     int  yalpha1 = 4095 - yalpha;
 595     int i;
 596
 597     for (i = 0; i < dstW - 7; i += 8) {
 598         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 599         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 600         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 601         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 602         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 603         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 604         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 605         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 606         output_pixel(*dest++, acc);
 607     }
 608 }
 609
 610 static av_always_inline void
 611 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 612                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 613                       const int16_t *abuf0, uint8_t *dest, int dstW,
 614                       int uvalpha, int y, enum PixelFormat target)
 615 {
 616     const uint8_t * const d128 = dither_8x8_220[y & 7];
 617     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 618     int i;
 619
 620     for (i = 0; i < dstW - 7; i += 8) {
 621         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 622         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 623         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 624         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 625         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 626         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 627         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 628         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 629         output_pixel(*dest++, acc);
 630     }
 631 }
 632
 633 #undef output_pixel
 634
/*
 * Generate the three output entry points name##ext##_X_c, name##ext##_2_c
 * and name##ext##_1_c for a packed output format.
 *
 * Each wrapper forwards its arguments unchanged to
 * name##base##_X_c_template / _2_c_template / _1_c_template and appends the
 * pixel format fmt as the trailing `target` argument.
 */
#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
                                  abuf0, dest, dstW, uvalpha, \
                                  y, fmt); \
}

/* Instantiates yuv2monowhite_{X,2,1}_c and yuv2monoblack_{X,2,1}_c on top of
 * the yuv2mono_*_c_template functions (base is empty). */
YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 669
 670 #define output_pixels(pos, Y1, U, Y2, V) \
 671     if (target == PIX_FMT_YUYV422) { \
 672         dest[pos + 0] = Y1; \
 673         dest[pos + 1] = U;  \
 674         dest[pos + 2] = Y2; \
 675         dest[pos + 3] = V;  \
 676     } else { \
 677         dest[pos + 0] = U;  \
 678         dest[pos + 1] = Y1; \
 679         dest[pos + 2] = V;  \
 680         dest[pos + 3] = Y2; \
 681     }
 682
 683 static av_always_inline void
 684 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 685                      const int16_t **lumSrc, int lumFilterSize,
 686                      const int16_t *chrFilter, const int16_t **chrUSrc,
 687                      const int16_t **chrVSrc, int chrFilterSize,
 688                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 689                      int y, enum PixelFormat target)
 690 {
 691     int i;
 692
 693     for (i = 0; i < (dstW >> 1); i++) {
 694         int j;
 695         int Y1 = 1 << 18;
 696         int Y2 = 1 << 18;
 697         int U  = 1 << 18;
 698         int V  = 1 << 18;
 699
 700         for (j = 0; j < lumFilterSize; j++) {
 701             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 702             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 703         }
 704         for (j = 0; j < chrFilterSize; j++) {
 705             U += chrUSrc[j][i] * chrFilter[j];
 706             V += chrVSrc[j][i] * chrFilter[j];
 707         }
 708         Y1 >>= 19;
 709         Y2 >>= 19;
 710         U  >>= 19;
 711         V  >>= 19;
 712         if ((Y1 | Y2 | U | V) & 0x100) {
 713             Y1 = av_clip_uint8(Y1);
 714             Y2 = av_clip_uint8(Y2);
 715             U  = av_clip_uint8(U);
 716             V  = av_clip_uint8(V);
 717         }
 718         output_pixels(4*i, Y1, U, Y2, V);
 719     }
 720 }
 721
 722 static av_always_inline void
 723 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 724                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 725                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 726                      int yalpha, int uvalpha, int y,
 727                      enum PixelFormat target)
 728 {
 729     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 730                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 731                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 732     int  yalpha1 = 4095 - yalpha;
 733     int uvalpha1 = 4095 - uvalpha;
 734     int i;
 735
 736     for (i = 0; i < (dstW >> 1); i++) {
 737         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 738         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 739         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 740         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 741
 742         output_pixels(i * 4, Y1, U, Y2, V);
 743     }
 744 }
 745
 746 static av_always_inline void
 747 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 748                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 749                      const int16_t *abuf0, uint8_t *dest, int dstW,
 750                      int uvalpha, int y, enum PixelFormat target)
 751 {
 752     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 753                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 754     int i;
 755
 756     if (uvalpha < 2048) {
 757         for (i = 0; i < (dstW >> 1); i++) {
 758             int Y1 = buf0[i * 2]     >> 7;
 759             int Y2 = buf0[i * 2 + 1] >> 7;
 760             int U  = ubuf1[i]        >> 7;
 761             int V  = vbuf1[i]        >> 7;
 762
 763             output_pixels(i * 4, Y1, U, Y2, V);
 764         }
 765     } else {
 766         for (i = 0; i < (dstW >> 1); i++) {
 767             int Y1 =  buf0[i * 2]          >> 7;
 768             int Y2 =  buf0[i * 2 + 1]      >> 7;
 769             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 770             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 771
 772             output_pixels(i * 4, Y1, U, Y2, V);
 773         }
 774     }
 775 }
 776
 777 #undef output_pixels
 778
/* Instantiates yuv2yuyv422_{X,2,1}_c and yuv2uyvy422_{X,2,1}_c on top of the
 * yuv2422_*_c_template functions. */
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 781
 782 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 783 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 784 #define output_pixel(pos, val) \
 785     if (isBE(target)) { \
 786         AV_WB16(pos, val); \
 787     } else { \
 788         AV_WL16(pos, val); \
 789     }
 790
 791 static av_always_inline void
 792 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 793                        const int32_t **lumSrc, int lumFilterSize,
 794                        const int16_t *chrFilter, const int32_t **chrUSrc,
 795                        const int32_t **chrVSrc, int chrFilterSize,
 796                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 797                        int y, enum PixelFormat target)
 798 {
 799     int i;
 800
 801     for (i = 0; i < (dstW >> 1); i++) {
 802         int j;
 803         int Y1 = 0;
 804         int Y2 = 0;
 805         int U  = -128 << 23; // 19
 806         int V  = -128 << 23;
 807         int R, G, B;
 808
 809         for (j = 0; j < lumFilterSize; j++) {
 810             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 811             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 812         }
 813         for (j = 0; j < chrFilterSize; j++) {
 814             U += chrUSrc[j][i] * chrFilter[j];
 815             V += chrVSrc[j][i] * chrFilter[j];
 816         }
 817
 818         // 8bit: 12+15=27; 16-bit: 12+19=31
 819         Y1 >>= 14; // 10
 820         Y2 >>= 14;
 821         U  >>= 14;
 822         V  >>= 14;
 823
 824         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 825         Y1 -= c->yuv2rgb_y_offset;
 826         Y2 -= c->yuv2rgb_y_offset;
 827         Y1 *= c->yuv2rgb_y_coeff;
 828         Y2 *= c->yuv2rgb_y_coeff;
 829         Y1 += 1 << 13; // 21
 830         Y2 += 1 << 13;
 831         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 832
 833         R = V * c->yuv2rgb_v2r_coeff;
 834         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 835         B =                            U * c->yuv2rgb_u2b_coeff;
 836
 837         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 838         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 839         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 840         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 841         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 842         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 843         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 844         dest += 6;
 845     }
 846 }
 847
 848 static av_always_inline void
 849 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 850                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 851                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 852                        int yalpha, int uvalpha, int y,
 853                        enum PixelFormat target)
 854 {
 855     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 856                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 857                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 858     int  yalpha1 = 4095 - yalpha;
 859     int uvalpha1 = 4095 - uvalpha;
 860     int i;
 861
 862     for (i = 0; i < (dstW >> 1); i++) {
 863         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 864         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 865         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 866         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 867         int R, G, B;
 868
 869         Y1 -= c->yuv2rgb_y_offset;
 870         Y2 -= c->yuv2rgb_y_offset;
 871         Y1 *= c->yuv2rgb_y_coeff;
 872         Y2 *= c->yuv2rgb_y_coeff;
 873         Y1 += 1 << 13;
 874         Y2 += 1 << 13;
 875
 876         R = V * c->yuv2rgb_v2r_coeff;
 877         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 878         B =                            U * c->yuv2rgb_u2b_coeff;
 879
 880         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 881         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 882         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 883         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 884         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 885         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 886         dest += 6;
 887     }
 888 }
 889
 890 static av_always_inline void
 891 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 892                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 893                        const int32_t *abuf0, uint16_t *dest, int dstW,
 894                        int uvalpha, int y, enum PixelFormat target)
 895 {
 896     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 897                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 898     int i;
 899
 900     if (uvalpha < 2048) {
 901         for (i = 0; i < (dstW >> 1); i++) {
 902             int Y1 = (buf0[i * 2]    ) >> 2;
 903             int Y2 = (buf0[i * 2 + 1]) >> 2;
 904             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 905             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 906             int R, G, B;
 907
 908             Y1 -= c->yuv2rgb_y_offset;
 909             Y2 -= c->yuv2rgb_y_offset;
 910             Y1 *= c->yuv2rgb_y_coeff;
 911             Y2 *= c->yuv2rgb_y_coeff;
 912             Y1 += 1 << 13;
 913             Y2 += 1 << 13;
 914
 915             R = V * c->yuv2rgb_v2r_coeff;
 916             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 917             B =                            U * c->yuv2rgb_u2b_coeff;
 918
 919             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 920             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 921             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 922             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 923             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 924             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 925             dest += 6;
 926         }
 927     } else {
 928         for (i = 0; i < (dstW >> 1); i++) {
 929             int Y1 = (buf0[i * 2]    ) >> 2;
 930             int Y2 = (buf0[i * 2 + 1]) >> 2;
 931             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 11)) >> 3;
 932             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 11)) >> 3;
 933             int R, G, B;
 934
 935             Y1 -= c->yuv2rgb_y_offset;
 936             Y2 -= c->yuv2rgb_y_offset;
 937             Y1 *= c->yuv2rgb_y_coeff;
 938             Y2 *= c->yuv2rgb_y_coeff;
 939             Y1 += 1 << 13;
 940             Y2 += 1 << 13;
 941
 942             R = V * c->yuv2rgb_v2r_coeff;
 943             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 944             B =                            U * c->yuv2rgb_u2b_coeff;
 945
 946             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 947             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 948             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 949             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 950             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 951             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 952             dest += 6;
 953         }
 954     }
 955 }
 956
 957 #undef output_pixel
 958 #undef r_b
 959 #undef b_r
 960
 961 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
 962 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
 963 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
 964 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
 965
 966 static av_always_inline void
 967 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
 968               int U, int V, int A1, int A2,
 969               const void *_r, const void *_g, const void *_b, int y,
 970               enum PixelFormat target, int hasAlpha)
 971 {
 972     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 973         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 974         uint32_t *dest = (uint32_t *) _dest;
 975         const uint32_t *r = (const uint32_t *) _r;
 976         const uint32_t *g = (const uint32_t *) _g;
 977         const uint32_t *b = (const uint32_t *) _b;
 978
 979 #if CONFIG_SMALL
 980         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 981
 982         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
 983         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
 984 #else
 985         if (hasAlpha) {
 986             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
 987
 988             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
 989             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
 990         } else {
 991             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
 992             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 993         }
 994 #endif
 995     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
 996         uint8_t *dest = (uint8_t *) _dest;
 997         const uint8_t *r = (const uint8_t *) _r;
 998         const uint8_t *g = (const uint8_t *) _g;
 999         const uint8_t *b = (const uint8_t *) _b;
1000
1001 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1002 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1003         dest[i * 6 + 0] = r_b[Y1];
1004         dest[i * 6 + 1] =   g[Y1];
1005         dest[i * 6 + 2] = b_r[Y1];
1006         dest[i * 6 + 3] = r_b[Y2];
1007         dest[i * 6 + 4] =   g[Y2];
1008         dest[i * 6 + 5] = b_r[Y2];
1009 #undef r_b
1010 #undef b_r
1011     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1012                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1013                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1014         uint16_t *dest = (uint16_t *) _dest;
1015         const uint16_t *r = (const uint16_t *) _r;
1016         const uint16_t *g = (const uint16_t *) _g;
1017         const uint16_t *b = (const uint16_t *) _b;
1018         int dr1, dg1, db1, dr2, dg2, db2;
1019
1020         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1021             dr1 = dither_2x2_8[ y & 1     ][0];
1022             dg1 = dither_2x2_4[ y & 1     ][0];
1023             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1024             dr2 = dither_2x2_8[ y & 1     ][1];
1025             dg2 = dither_2x2_4[ y & 1     ][1];
1026             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1027         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1028             dr1 = dither_2x2_8[ y & 1     ][0];
1029             dg1 = dither_2x2_8[ y & 1     ][1];
1030             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1031             dr2 = dither_2x2_8[ y & 1     ][1];
1032             dg2 = dither_2x2_8[ y & 1     ][0];
1033             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1034         } else {
1035             dr1 = dither_4x4_16[ y & 3     ][0];
1036             dg1 = dither_4x4_16[ y & 3     ][1];
1037             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1038             dr2 = dither_4x4_16[ y & 3     ][1];
1039             dg2 = dither_4x4_16[ y & 3     ][0];
1040             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1041         }
1042
1043         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1044         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1045     } else /* 8/4-bit */ {
1046         uint8_t *dest = (uint8_t *) _dest;
1047         const uint8_t *r = (const uint8_t *) _r;
1048         const uint8_t *g = (const uint8_t *) _g;
1049         const uint8_t *b = (const uint8_t *) _b;
1050         int dr1, dg1, db1, dr2, dg2, db2;
1051
1052         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1053             const uint8_t * const d64 = dither_8x8_73[y & 7];
1054             const uint8_t * const d32 = dither_8x8_32[y & 7];
1055             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1056             db1 =       d64[(i * 2 + 0) & 7];
1057             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1058             db2 =       d64[(i * 2 + 1) & 7];
1059         } else {
1060             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1061             const uint8_t * const d128 = dither_8x8_220[y & 7];
1062             dr1 = db1 = d128[(i * 2 + 0) & 7];
1063             dg1 =        d64[(i * 2 + 0) & 7];
1064             dr2 = db2 = d128[(i * 2 + 1) & 7];
1065             dg2 =        d64[(i * 2 + 1) & 7];
1066         }
1067
1068         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1069             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1070                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1071         } else {
1072             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1073             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1074         }
1075     }
1076 }
1077
1078 static av_always_inline void
1079 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1080                      const int16_t **lumSrc, int lumFilterSize,
1081                      const int16_t *chrFilter, const int16_t **chrUSrc,
1082                      const int16_t **chrVSrc, int chrFilterSize,
1083                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1084                      int y, enum PixelFormat target, int hasAlpha)
1085 {
1086     int i;
1087
1088     for (i = 0; i < (dstW >> 1); i++) {
1089         int j;
1090         int Y1 = 1 << 18;
1091         int Y2 = 1 << 18;
1092         int U  = 1 << 18;
1093         int V  = 1 << 18;
1094         int av_unused A1, A2;
1095         const void *r, *g, *b;
1096
1097         for (j = 0; j < lumFilterSize; j++) {
1098             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1099             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1100         }
1101         for (j = 0; j < chrFilterSize; j++) {
1102             U += chrUSrc[j][i] * chrFilter[j];
1103             V += chrVSrc[j][i] * chrFilter[j];
1104         }
1105         Y1 >>= 19;
1106         Y2 >>= 19;
1107         U  >>= 19;
1108         V  >>= 19;
1109         if ((Y1 | Y2 | U | V) & 0x100) {
1110             Y1 = av_clip_uint8(Y1);
1111             Y2 = av_clip_uint8(Y2);
1112             U  = av_clip_uint8(U);
1113             V  = av_clip_uint8(V);
1114         }
1115         if (hasAlpha) {
1116             A1 = 1 << 18;
1117             A2 = 1 << 18;
1118             for (j = 0; j < lumFilterSize; j++) {
1119                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1120                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1121             }
1122             A1 >>= 19;
1123             A2 >>= 19;
1124             if ((A1 | A2) & 0x100) {
1125                 A1 = av_clip_uint8(A1);
1126                 A2 = av_clip_uint8(A2);
1127             }
1128         }
1129
1130         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1131         r =  c->table_rV[V];
1132         g = (c->table_gU[U] + c->table_gV[V]);
1133         b =  c->table_bU[U];
1134
1135         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1136                       r, g, b, y, target, hasAlpha);
1137     }
1138 }
1139
1140 static av_always_inline void
1141 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1142                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1143                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1144                      int yalpha, int uvalpha, int y,
1145                      enum PixelFormat target, int hasAlpha)
1146 {
1147     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1148                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1149                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1150                   *abuf0 = abuf[0], *abuf1 = abuf[1];
1151     int  yalpha1 = 4095 - yalpha;
1152     int uvalpha1 = 4095 - uvalpha;
1153     int i;
1154
1155     for (i = 0; i < (dstW >> 1); i++) {
1156         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1157         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1158         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1159         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1160         int A1, A2;
1161         const void *r =  c->table_rV[V],
1162                    *g = (c->table_gU[U] + c->table_gV[V]),
1163                    *b =  c->table_bU[U];
1164
1165         if (hasAlpha) {
1166             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1167             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1168         }
1169
1170         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1171                       r, g, b, y, target, hasAlpha);
1172     }
1173 }
1174
1175 static av_always_inline void
1176 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1177                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1178                      const int16_t *abuf0, uint8_t *dest, int dstW,
1179                      int uvalpha, int y, enum PixelFormat target,
1180                      int hasAlpha)
1181 {
1182     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1183                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1184     int i;
1185
1186     if (uvalpha < 2048) {
1187         for (i = 0; i < (dstW >> 1); i++) {
1188             int Y1 = buf0[i * 2]     >> 7;
1189             int Y2 = buf0[i * 2 + 1] >> 7;
1190             int U  = ubuf1[i]        >> 7;
1191             int V  = vbuf1[i]        >> 7;
1192             int A1, A2;
1193             const void *r =  c->table_rV[V],
1194                        *g = (c->table_gU[U] + c->table_gV[V]),
1195                        *b =  c->table_bU[U];
1196
1197             if (hasAlpha) {
1198                 A1 = abuf0[i * 2    ] >> 7;
1199                 A2 = abuf0[i * 2 + 1] >> 7;
1200             }
1201
1202             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1203                           r, g, b, y, target, hasAlpha);
1204         }
1205     } else {
1206         for (i = 0; i < (dstW >> 1); i++) {
1207             int Y1 =  buf0[i * 2]          >> 7;
1208             int Y2 =  buf0[i * 2 + 1]      >> 7;
1209             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1210             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1211             int A1, A2;
1212             const void *r =  c->table_rV[V],
1213                        *g = (c->table_gU[U] + c->table_gV[V]),
1214                        *b =  c->table_bU[U];
1215
1216             if (hasAlpha) {
1217                 A1 = abuf0[i * 2    ] >> 7;
1218                 A2 = abuf0[i * 2 + 1] >> 7;
1219             }
1220
1221             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1222                           r, g, b, y, target, hasAlpha);
1223         }
1224     }
1225 }
1226
1227 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1228 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1229                                 const int16_t **lumSrc, int lumFilterSize, \
1230                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1231                                 const int16_t **chrVSrc, int chrFilterSize, \
1232                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1233                                 int y) \
1234 { \
1235     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1236                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1237                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1238 }
1239 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1240 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1241 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1242                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1243                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1244                                 int yalpha, int uvalpha, int y) \
1245 { \
1246     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1247                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1248 } \
1249  \
1250 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1251                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1252                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1253                                 int uvalpha, int y) \
1254 { \
1255     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1256                                   dstW, uvalpha, y, fmt, hasAlpha); \
1257 }
1258
1259 #if CONFIG_SMALL
1260 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1261 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1262 #else
1263 #if CONFIG_SWSCALE_ALPHA
1264 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1265 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1266 #endif
1267 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1268 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1269 #endif
1270 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1271 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1272 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1273 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1274 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1275 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1276 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1277 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1278
1279 static av_always_inline void
1280 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1281                           const int16_t **lumSrc, int lumFilterSize,
1282                           const int16_t *chrFilter, const int16_t **chrUSrc,
1283                           const int16_t **chrVSrc, int chrFilterSize,
1284                           const int16_t **alpSrc, uint8_t *dest,
1285                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1286 {
1287     int i;
1288     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1289
1290     for (i = 0; i < dstW; i++) {
1291         int j;
1292         int Y = 0;
1293         int U = -128 << 19;
1294         int V = -128 << 19;
1295         int av_unused A;
1296         int R, G, B;
1297
1298         for (j = 0; j < lumFilterSize; j++) {
1299             Y += lumSrc[j][i] * lumFilter[j];
1300         }
1301         for (j = 0; j < chrFilterSize; j++) {
1302             U += chrUSrc[j][i] * chrFilter[j];
1303             V += chrVSrc[j][i] * chrFilter[j];
1304         }
1305         Y >>= 10;
1306         U >>= 10;
1307         V >>= 10;
1308         if (hasAlpha) {
1309             A = 1 << 21;
1310             for (j = 0; j < lumFilterSize; j++) {
1311                 A += alpSrc[j][i] * lumFilter[j];
1312             }
1313             A >>= 19;
1314             if (A & 0x100)
1315                 A = av_clip_uint8(A);
1316         }
1317         Y -= c->yuv2rgb_y_offset;
1318         Y *= c->yuv2rgb_y_coeff;
1319         Y += 1 << 21;
1320         R = Y + V*c->yuv2rgb_v2r_coeff;
1321         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1322         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1323         if ((R | G | B) & 0xC0000000) {
1324             R = av_clip_uintp2(R, 30);
1325             G = av_clip_uintp2(G, 30);
1326             B = av_clip_uintp2(B, 30);
1327         }
1328
1329         switch(target) {
1330         case PIX_FMT_ARGB:
1331             dest[0] = hasAlpha ? A : 255;
1332             dest[1] = R >> 22;
1333             dest[2] = G >> 22;
1334             dest[3] = B >> 22;
1335             break;
1336         case PIX_FMT_RGB24:
1337             dest[0] = R >> 22;
1338             dest[1] = G >> 22;
1339             dest[2] = B >> 22;
1340             break;
1341         case PIX_FMT_RGBA:
1342             dest[0] = R >> 22;
1343             dest[1] = G >> 22;
1344             dest[2] = B >> 22;
1345             dest[3] = hasAlpha ? A : 255;
1346             break;
1347         case PIX_FMT_ABGR:
1348             dest[0] = hasAlpha ? A : 255;
1349             dest[1] = B >> 22;
1350             dest[2] = G >> 22;
1351             dest[3] = R >> 22;
1352             dest += 4;
1353             break;
1354         case PIX_FMT_BGR24:
1355             dest[0] = B >> 22;
1356             dest[1] = G >> 22;
1357             dest[2] = R >> 22;
1358             break;
1359         case PIX_FMT_BGRA:
1360             dest[0] = B >> 22;
1361             dest[1] = G >> 22;
1362             dest[2] = R >> 22;
1363             dest[3] = hasAlpha ? A : 255;
1364             break;
1365         }
1366         dest += step;
1367     }
1368 }
1369
1370 #if CONFIG_SMALL
1371 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1372 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1373 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1374 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1375 #else
1376 #if CONFIG_SWSCALE_ALPHA
1377 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1378 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1379 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1380 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1381 #endif
1382 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1383 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1384 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1385 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1386 #endif
1387 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1388 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1389
1390 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1391                                        int width, int height,
1392                                        int y, uint8_t val)
1393 {
1394     int i;
1395     uint8_t *ptr = plane + stride*y;
1396     for (i=0; i<height; i++) {
1397         memset(ptr, val, width);
1398         ptr += stride;
1399     }
1400 }
1401
1402 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1403
1404 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1405 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1406
1407 static av_always_inline void
1408 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1409                     enum PixelFormat origin)
1410 {
1411     int i;
1412     for (i = 0; i < width; i++) {
1413         unsigned int r_b = input_pixel(&src[i*3+0]);
1414         unsigned int   g = input_pixel(&src[i*3+1]);
1415         unsigned int b_r = input_pixel(&src[i*3+2]);
1416
1417         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1418     }
1419 }
1420
1421 static av_always_inline void
1422 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1423                     const uint16_t *src1, const uint16_t *src2,
1424                     int width, enum PixelFormat origin)
1425 {
1426     int i;
1427     assert(src1==src2);
1428     for (i = 0; i < width; i++) {
1429         int r_b = input_pixel(&src1[i*3+0]);
1430         int   g = input_pixel(&src1[i*3+1]);
1431         int b_r = input_pixel(&src1[i*3+2]);
1432
1433         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1434         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1435     }
1436 }
1437
1438 static av_always_inline void
1439 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1440                           const uint16_t *src1, const uint16_t *src2,
1441                           int width, enum PixelFormat origin)
1442 {
1443     int i;
1444     assert(src1==src2);
1445     for (i = 0; i < width; i++) {
1446         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1447         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1448         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1449
1450         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1451         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1452     }
1453 }
1454
1455 #undef r
1456 #undef b
1457 #undef input_pixel
1458
1459 #define rgb48funcs(pattern, BE_LE, origin) \
1460 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
1461                                     int width, uint32_t *unused) \
1462 { \
1463     const uint16_t *src = (const uint16_t *) _src; \
1464     uint16_t *dst = (uint16_t *) _dst; \
1465     rgb48ToY_c_template(dst, src, width, origin); \
1466 } \
1467  \
1468 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1469                                     const uint8_t *_src1, const uint8_t *_src2, \
1470                                     int width, uint32_t *unused) \
1471 { \
1472     const uint16_t *src1 = (const uint16_t *) _src1, \
1473                    *src2 = (const uint16_t *) _src2; \
1474     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1475     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1476 } \
1477  \
1478 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1479                                     const uint8_t *_src1, const uint8_t *_src2, \
1480                                     int width, uint32_t *unused) \
1481 { \
1482     const uint16_t *src1 = (const uint16_t *) _src1, \
1483                    *src2 = (const uint16_t *) _src2; \
1484     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1485     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1486 }
1487
1488 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1489 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1490 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1491 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1492
1493 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1494                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1495                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1496
1497 static av_always_inline void
1498 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1499                        int width, enum PixelFormat origin,
1500                        int shr,   int shg,   int shb, int shp,
1501                        int maskr, int maskg, int maskb,
1502                        int rsh,   int gsh,   int bsh, int S)
1503 {
1504     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1505               rnd = 33 << (S - 1);
1506     int i;
1507
1508     for (i = 0; i < width; i++) {
1509         int px = input_pixel(i) >> shp;
1510         int b = (px & maskb) >> shb;
1511         int g = (px & maskg) >> shg;
1512         int r = (px & maskr) >> shr;
1513
1514         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1515     }
1516 }
1517
1518 static av_always_inline void
1519 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1520                         const uint8_t *src, int width,
1521                         enum PixelFormat origin,
1522                         int shr,   int shg,   int shb, int shp,
1523                         int maskr, int maskg, int maskb,
1524                         int rsh,   int gsh,   int bsh, int S)
1525 {
1526     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1527               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1528               rnd = 257 << (S - 1);
1529     int i;
1530
1531     for (i = 0; i < width; i++) {
1532         int px = input_pixel(i) >> shp;
1533         int b = (px & maskb) >> shb;
1534         int g = (px & maskg) >> shg;
1535         int r = (px & maskr) >> shr;
1536
1537         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1538         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1539     }
1540 }
1541
1542 static av_always_inline void
1543 rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1544                              const uint8_t *src, int width,
1545                              enum PixelFormat origin,
1546                              int shr,   int shg,   int shb, int shp,
1547                              int maskr, int maskg, int maskb,
1548                              int rsh,   int gsh,   int bsh, int S)
1549 {
1550     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1551               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1552               rnd = 257 << S, maskgx = ~(maskr | maskb);
1553     int i;
1554
1555     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1556     for (i = 0; i < width; i++) {
1557         int px0 = input_pixel(2 * i + 0) >> shp;
1558         int px1 = input_pixel(2 * i + 1) >> shp;
1559         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1560         int rb = px0 + px1 - g;
1561
1562         b = (rb & maskb) >> shb;
1563         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1564             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1565             g >>= shg;
1566         } else {
1567             g = (g  & maskg) >> shg;
1568         }
1569         r = (rb & maskr) >> shr;
1570
1571         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
1572         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
1573     }
1574 }
1575
1576 #undef input_pixel
1577
1578 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1579                          maskg, maskb, rsh, gsh, bsh, S) \
1580 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1581                           int width, uint32_t *unused) \
1582 { \
1583     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1584                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1585 } \
1586  \
1587 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1588                            const uint8_t *src, const uint8_t *dummy, \
1589                            int width, uint32_t *unused) \
1590 { \
1591     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1592                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1593 } \
1594  \
1595 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1596                                 const uint8_t *src, const uint8_t *dummy, \
1597                                 int width, uint32_t *unused) \
1598 { \
1599     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1600                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1601 }
1602
1603 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1604 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1605 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1606 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1607 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1608 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1609 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1610 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1611 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1612 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1613 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1614 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1615
/**
 * Extract the alpha plane from 4-byte pixels whose alpha is the first byte.
 *
 * @param dst    output alpha samples, one byte per pixel
 * @param src    packed input, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *px = src;
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = px[0];
        px    += 4;
    }
}
1623
/**
 * Extract the alpha plane from 4-byte pixels whose alpha is the last byte.
 *
 * @param dst    output alpha samples, one byte per pixel
 * @param src    packed input, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *px = src;
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = px[3];
        px    += 4;
    }
}
1631
/**
 * Look up luma for palettized input: the low byte of each palette entry
 * is written as the output sample.
 *
 * @param dst   output luma samples
 * @param src   palette indices, one byte per pixel
 * @param width number of pixels
 * @param pal   palette, indexed by the source byte
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
    }
}
1641
/**
 * Look up chroma for palettized input: bits 8..15 of each palette entry
 * become U, bits 16..23 become V.
 *
 * @param dstU  output U samples
 * @param dstV  output V samples
 * @param src1  palette indices, one byte per pixel
 * @param src2  must be the same pointer as src1 (asserted)
 * @param width number of pixels
 * @param pal   palette, indexed by the source byte
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x;

    assert(src1 == src2);
    for (x = 0; x < width; x++) {
        const int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
    }
}
1655
/**
 * Expand 1 bit-per-pixel input to 8-bit luma, with a clear bit mapping
 * to 255 and a set bit to 0. Bits are consumed most significant first.
 *
 * Unlike a plain width / 8 loop, the trailing width % 8 pixels that only
 * partially fill the last source byte are converted as well; no output
 * beyond dst[width - 1] is written.
 *
 * @param dst    output luma samples, width bytes
 * @param src    packed bits, (width + 7) / 8 bytes are read
 * @param width  number of pixels
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int whole = width / 8;    /* completely filled source bytes */
    const int tail  = width % 8;    /* pixels in the partial last byte */
    int i, j;

    for (i = 0; i < whole; i++) {
        const unsigned d = src[i] ^ 0xFFu;  /* invert: 0 bit -> white */
        for (j = 0; j < 8; j++)
            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
    }
    if (tail > 0) {
        const unsigned d = src[whole] ^ 0xFFu;
        for (j = 0; j < tail; j++)
            dst[8 * whole + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1666
/**
 * Expand 1 bit-per-pixel input to 8-bit luma, with a set bit mapping
 * to 255 and a clear bit to 0. Bits are consumed most significant first.
 *
 * Unlike a plain width / 8 loop, the trailing width % 8 pixels that only
 * partially fill the last source byte are converted as well; no output
 * beyond dst[width - 1] is written.
 *
 * @param dst    output luma samples, width bytes
 * @param src    packed bits, (width + 7) / 8 bytes are read
 * @param width  number of pixels
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int whole = width / 8;    /* completely filled source bytes */
    const int tail  = width % 8;    /* pixels in the partial last byte */
    int i, j;

    for (i = 0; i < whole; i++) {
        const unsigned d = src[i];
        for (j = 0; j < 8; j++)
            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
    }
    if (tail > 0) {
        const unsigned d = src[whole];
        for (j = 0; j < tail; j++)
            dst[8 * whole + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1677
1678 //FIXME yuy2* can read up to 7 samples too much
1679
/**
 * Extract luma from packed input in which every even byte is a luma sample.
 *
 * @param dst    output luma samples
 * @param src    packed input, 2 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = src[2 * x];
    }
}
1687
/**
 * Extract chroma from packed 4-byte groups: byte 1 of each group is
 * written to dstU and byte 3 to dstV.
 *
 * @param dstU   output U samples
 * @param dstV   output V samples
 * @param src1   packed input, 4 bytes per output sample
 * @param src2   must be the same pointer as src1 (asserted)
 * @param width  number of output chroma samples
 * @param unused ignored
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int x;

    for (x = 0; x < width; x++) {
        dstU[x] = group[1];
        dstV[x] = group[3];
        group  += 4;
    }
    assert(src1 == src2);
}
1698
/**
 * Byte-swap a line of 16-bit samples.
 *
 * @param _dst   output buffer, accessed as uint16_t
 * @param _src   input buffer, accessed as uint16_t
 * @param width  number of 16-bit samples
 * @param unused ignored
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
{
    const uint16_t *in  = (const uint16_t *) _src;
    uint16_t       *out = (uint16_t *) _dst;
    int x;

    for (x = 0; x < width; x++)
        out[x] = av_bswap16(in[x]);
}
1708
/**
 * Byte-swap one line each of two planes of 16-bit samples.
 *
 * @param _dstU  output for the first plane, accessed as uint16_t
 * @param _dstV  output for the second plane, accessed as uint16_t
 * @param _src1  input for the first plane, accessed as uint16_t
 * @param _src2  input for the second plane, accessed as uint16_t
 * @param width  number of 16-bit samples per plane
 * @param unused ignored
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *inU  = (const uint16_t *) _src1;
    const uint16_t *inV  = (const uint16_t *) _src2;
    uint16_t       *outU = (uint16_t *) _dstU;
    uint16_t       *outV = (uint16_t *) _dstV;
    int x;

    for (x = 0; x < width; x++) {
        outU[x] = av_bswap16(inU[x]);
        outV[x] = av_bswap16(inV[x]);
    }
}
1721
/* This is almost identical to the previous, and exists only because
 * yuy2To(Y/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
/**
 * Extract luma from packed input in which every odd byte is a luma sample.
 *
 * @param dst    output luma samples
 * @param src    packed input, 2 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = src[2 * x + 1];
    }
}
1731
/**
 * Extract chroma from packed 4-byte groups: byte 0 of each group is
 * written to dstU and byte 2 to dstV.
 *
 * @param dstU   output U samples
 * @param dstV   output V samples
 * @param src1   packed input, 4 bytes per output sample
 * @param src2   must be the same pointer as src1 (asserted)
 * @param width  number of output chroma samples
 * @param unused ignored
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int x;

    for (x = 0; x < width; x++) {
        dstU[x] = group[0];
        dstV[x] = group[2];
        group  += 4;
    }
    assert(src1 == src2);
}
1742
1743 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1744                                         const uint8_t *src, int width)
1745 {
1746     int i;
1747     for (i = 0; i < width; i++) {
1748         dst1[i] = src[2*i+0];
1749         dst2[i] = src[2*i+1];
1750     }
1751 }
1752
/**
 * Split an interleaved chroma line: even bytes of src1 go to dstU, odd
 * bytes to dstV. src2 and unused are ignored.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const even_dst = dstU;
    uint8_t *const odd_dst  = dstV;

    nvXXtoUV_c(even_dst, odd_dst, src1, width);
}
1759
/**
 * Split an interleaved chroma line with swapped order: even bytes of src1
 * go to dstV, odd bytes to dstU. src2 and unused are ignored.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const even_dst = dstV;
    uint8_t *const odd_dst  = dstU;

    nvXXtoUV_c(even_dst, odd_dst, src1, width);
}
1766
1767 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1768
1769 // FIXME Maybe dither instead.
1770 static av_always_inline void
1771 yuv9_OR_10ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1772                           const uint16_t *srcU, const uint16_t *srcV,
1773                           int width, enum PixelFormat origin, int depth)
1774 {
1775     int i;
1776
1777     for (i = 0; i < width; i++) {
1778         int upx = input_pixel(&srcU[i]);
1779         int vpx = input_pixel(&srcV[i]);
1780         dstU[i] =  (upx << (16 - depth)) | (upx >> (2 * depth - 16));
1781         dstV[i] =  (vpx << (16 - depth)) | (vpx >> (2 * depth - 16));
1782     }
1783 }
1784
1785 static av_always_inline void
1786 yuv9_or_10ToY_c_template(uint16_t *dstY, const uint16_t *srcY,
1787                          int width, enum PixelFormat origin, int depth)
1788 {
1789     int i;
1790
1791     for (i = 0; i < width; i++) {
1792         int px = input_pixel(&srcY[i]);
1793         dstY[i] =  (px << (16 - depth)) | (px >> (2 * depth - 16));
1794     }
1795 }
1796
1797 #undef input_pixel
1798
1799 #define YUV_NBPS(depth, BE_LE, origin) \
1800 static void BE_LE ## depth ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1801                                      const uint8_t *_srcU, const uint8_t *_srcV, \
1802                                      int width, uint32_t *unused) \
1803 { \
1804     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1805     const uint16_t *srcU = (const uint16_t *) _srcU, \
1806                    *srcV = (const uint16_t *) _srcV; \
1807     yuv9_OR_10ToUV_c_template(dstU, dstV, srcU, srcV, width, origin, depth); \
1808 } \
1809 static void BE_LE ## depth ## ToY_c(uint8_t *_dstY, const uint8_t *_srcY, \
1810                                     int width, uint32_t *unused) \
1811 { \
1812     uint16_t *dstY = (uint16_t *) _dstY; \
1813     const uint16_t *srcY = (const uint16_t *) _srcY; \
1814     yuv9_or_10ToY_c_template(dstY, srcY, width, origin, depth); \
1815 }
1816
1817 YUV_NBPS( 9, LE, PIX_FMT_YUV420P9LE);
1818 YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
1819 YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
1820 YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
1821
1822 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1823                        int width, uint32_t *unused)
1824 {
1825     int i;
1826     for (i=0; i<width; i++) {
1827         int b= src[i*3+0];
1828         int g= src[i*3+1];
1829         int r= src[i*3+2];
1830
1831         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1832     }
1833 }
1834
1835 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1836                         const uint8_t *src2, int width, uint32_t *unused)
1837 {
1838     int i;
1839     for (i=0; i<width; i++) {
1840         int b= src1[3*i + 0];
1841         int g= src1[3*i + 1];
1842         int r= src1[3*i + 2];
1843
1844         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1845         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1846     }
1847     assert(src1 == src2);
1848 }
1849
1850 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1851                              const uint8_t *src2, int width, uint32_t *unused)
1852 {
1853     int i;
1854     for (i=0; i<width; i++) {
1855         int b= src1[6*i + 0] + src1[6*i + 3];
1856         int g= src1[6*i + 1] + src1[6*i + 4];
1857         int r= src1[6*i + 2] + src1[6*i + 5];
1858
1859         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1860         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1861     }
1862     assert(src1 == src2);
1863 }
1864
1865 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1866                        uint32_t *unused)
1867 {
1868     int i;
1869     for (i=0; i<width; i++) {
1870         int r= src[i*3+0];
1871         int g= src[i*3+1];
1872         int b= src[i*3+2];
1873
1874         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1875     }
1876 }
1877
1878 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1879                         const uint8_t *src2, int width, uint32_t *unused)
1880 {
1881     int i;
1882     assert(src1==src2);
1883     for (i=0; i<width; i++) {
1884         int r= src1[3*i + 0];
1885         int g= src1[3*i + 1];
1886         int b= src1[3*i + 2];
1887
1888         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1889         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1890     }
1891 }
1892
1893 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1894                              const uint8_t *src2, int width, uint32_t *unused)
1895 {
1896     int i;
1897     assert(src1==src2);
1898     for (i=0; i<width; i++) {
1899         int r= src1[6*i + 0] + src1[6*i + 3];
1900         int g= src1[6*i + 1] + src1[6*i + 4];
1901         int b= src1[6*i + 2] + src1[6*i + 5];
1902
1903         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1904         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1905     }
1906 }
1907
/**
 * Horizontal FIR scaler for 16-bit input producing 19-bit output.
 *
 * For every output sample, filterSize consecutive input samples starting
 * at filterPos[i] are weighted with the i-th row of the filter table.
 * The accumulated sum is shifted right by 11 and clipped from above to
 * (1 << 19) - 1.
 *
 * @param _dst       output buffer, written as int32_t
 * @param dstW       number of output samples
 * @param _src       input buffer, read as uint16_t
 * @param filter     coefficient table, filterSize entries per output sample
 * @param filterPos  first input index for each output sample
 * @param filterSize number of taps
 */
static void hScale16_c(int16_t *_dst, int dstW, const uint8_t *_src,
                       const int16_t *filter,
                       const int16_t *filterPos, int filterSize)
{
    int32_t        *out   = (int32_t *) _dst;
    const uint16_t *in    = (const uint16_t *) _src;
    const unsigned  limit = (1 << 19) - 1;
    int x;

    for (x = 0; x < dstW; x++) {
        const uint16_t *tap  = in + filterPos[x];
        const int16_t  *coef = filter + filterSize * x;
        unsigned int    acc  = 0;
        int k;

        for (k = 0; k < filterSize; k++)
            acc += tap[k] * coef[k];

        // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
        acc >>= 11;
        out[x] = acc > limit ? limit : acc;
    }
}
1928
/**
 * Horizontal FIR scaler for 8-bit input (bilinear / bicubic scaling).
 *
 * For every output sample, filterSize consecutive input bytes starting at
 * filterPos[i] are weighted with the i-th row of the filter table. The sum
 * is shifted right by 7 and clipped from above to (1 << 15) - 1, because
 * the cubic filter can overshoot.
 *
 * @param dst        output samples
 * @param dstW       number of output samples
 * @param src        input bytes
 * @param filter     coefficient table, filterSize entries per output sample
 * @param filterPos  first input index for each output sample
 * @param filterSize number of taps
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    const int limit = (1 << 15) - 1;
    int x;

    for (x = 0; x < dstW; x++) {
        const uint8_t *tap  = src + filterPos[x];
        const int16_t *coef = filter + filterSize * x;
        int acc = 0;
        int k;

        for (k = 0; k < filterSize; k++)
            acc += ((int) tap[k]) * coef[k];

        acc >>= 7;
        dst[x] = acc > limit ? limit : acc;
    }
}
1947
//FIXME all pal and rgb srcFormats could do this conversion as well
1949 //FIXME all scalers more complex than bilinear could do half of this transform
/**
 * Rescale 15-bit chroma samples in place with
 * (min(x, 30775) * 4663 - 9289992) >> 12.
 *
 * @param dstU  first chroma line, modified in place
 * @param dstV  second chroma line, modified in place
 * @param width number of samples per line
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int u, v;

        u = dstU[x];
        if (u > 30775)
            u = 30775;
        dstU[x] = (u * 4663 - 9289992) >> 12; //-264

        v = dstV[x];
        if (v > 30775)
            v = 30775;
        dstV[x] = (v * 4663 - 9289992) >> 12; //-264
    }
}
/**
 * Rescale 15-bit chroma samples in place with (x * 1799 + 4081085) >> 11.
 *
 * @param dstU  first chroma line, modified in place
 * @param dstV  second chroma line, modified in place
 * @param width number of samples per line
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int u = dstU[x];
        dstU[x] = (u * 1799 + 4081085) >> 11; //1469
        {
            const int v = dstV[x];
            dstV[x] = (v * 1799 + 4081085) >> 11; //1469
        }
    }
}
/**
 * Rescale 15-bit luma samples in place with
 * (min(x, 30189) * 19077 - 39057361) >> 14.
 *
 * @param dst   luma line, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int y = dst[x];

        if (y > 30189)
            y = 30189;
        dst[x] = (y * 19077 - 39057361) >> 14;
    }
}
/**
 * Rescale 15-bit luma samples in place with (x * 14071 + 33561947) >> 14.
 *
 * @param dst   luma line, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int y = dst[x];

        dst[x] = (y * 14071 + 33561947) >> 14;
    }
}
1978
/**
 * Rescale 19-bit chroma samples (stored as int32_t) in place with
 * (min(x, 30775 << 4) * 4663 - (9289992 << 4)) >> 12.
 *
 * The product is formed in 64 bits: (30775 << 4) * 4663 is larger than
 * INT_MAX, so a plain int multiplication overflows for samples near the
 * top of the range.
 *
 * @param _dstU first chroma line, accessed as int32_t, modified in place
 * @param _dstV second chroma line, accessed as int32_t, modified in place
 * @param width number of samples per line
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    const int32_t limit = 30775 << 4;
    const int64_t bias  = (int64_t) 9289992 << 4;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    int i;

    for (i = 0; i < width; i++) {
        const int64_t u = dstU[i] < limit ? dstU[i] : limit;
        dstU[i] = (int32_t) ((u * 4663 - bias) >> 12); //-264
        {
            const int64_t v = dstV[i] < limit ? dstV[i] : limit;
            dstV[i] = (int32_t) ((v * 4663 - bias) >> 12); //-264
        }
    }
}
/**
 * Rescale 19-bit chroma samples (stored as int32_t) in place with
 * (x * 1799 + (4081085 << 4)) >> 11.
 *
 * @param _dstU first chroma line, accessed as int32_t, modified in place
 * @param _dstV second chroma line, accessed as int32_t, modified in place
 * @param width number of samples per line
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u = (int32_t *) _dstU;
    int32_t *v = (int32_t *) _dstV;
    int x;

    for (x = 0; x < width; x++) {
        const int32_t cu = u[x];
        u[x] = (cu * 1799 + (4081085 << 4)) >> 11; //1469
        {
            const int32_t cv = v[x];
            v[x] = (cv * 1799 + (4081085 << 4)) >> 11; //1469
        }
    }
}
/**
 * Rescale 19-bit luma samples (stored as int32_t) in place with
 * (min(x, 30189 << 4) * 19077 - (39057361 << 4)) >> 14.
 *
 * The arithmetic is done in 64 bits: (30189 << 4) * 19077 is roughly
 * 9.2e9 and does not fit into 32 bits, so an int multiplication yields
 * wrong values for most of the input range.
 *
 * @param _dst  luma line, accessed as int32_t, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    const int32_t limit = 30189 << 4;
    const int64_t bias  = (int64_t) 39057361 << 4;
    int32_t *dst = (int32_t *) _dst;
    int i;

    for (i = 0; i < width; i++) {
        const int64_t y = dst[i] < limit ? dst[i] : limit;

        dst[i] = (int32_t) ((y * 19077 - bias) >> 14);
    }
}
/**
 * Rescale 19-bit luma samples (stored as int32_t) in place with
 * (x * 14071 + (33561947 << 4)) >> 14.
 *
 * The arithmetic is done in 64 bits: a 19-bit sample times 14071 can
 * reach roughly 7.4e9 and does not fit into 32 bits.
 *
 * @param _dst  luma line, accessed as int32_t, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    const int64_t bias = (int64_t) 33561947 << 4;
    int32_t *dst = (int32_t *) _dst;
    int i;

    for (i = 0; i < width; i++) {
        const int64_t y = dst[i];

        dst[i] = (int32_t) ((y * 14071 + bias) >> 14);
    }
}
2013
2014 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2015                            const uint8_t *src, int srcW, int xInc)
2016 {
2017     int i;
2018     unsigned int xpos=0;
2019     for (i=0;i<dstWidth;i++) {
2020         register unsigned int xx=xpos>>16;
2021         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2022         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2023         xpos+=xInc;
2024     }
2025 }
2026
/**
 * Expand 8-bit samples to 16 bits by writing each source byte into both
 * bytes of the corresponding output word.
 *
 * The line is processed from the last sample to the first, so the
 * conversion also works when src is the start of the dst buffer.
 *
 * @param _dst output buffer, written bytewise (2 * len bytes)
 * @param src  input bytes
 * @param len  number of samples
 */
static void scale8To16Rv_c(uint16_t *_dst, const uint8_t *src, int len)
{
    uint8_t *out = (uint8_t *) _dst;
    int x = len;

    while (x-- > 0) {
        const uint8_t v = src[x];

        out[2 * x + 1] = v;
        out[2 * x]     = v;
    }
}
2035
/**
 * Reduce 32-bit samples to 16-bit ones by dropping the four least
 * significant bits, walking the line from the first sample to the last.
 *
 * @param dst output samples
 * @param src input samples
 * @param len number of samples
 */
static void scale19To15Fw_c(int16_t *dst, const int32_t *src, int len)
{
    int x = 0;

    while (x < len) {
        dst[x] = src[x] >> 4;
        x++;
    }
}
2043
2044 // *** horizontal scale Y line to temp buffer
2045 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2046                                      const uint8_t *src, int srcW, int xInc,
2047                                      const int16_t *hLumFilter,
2048                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2049                                      uint8_t *formatConvBuffer,
2050                                      uint32_t *pal, int isAlpha)
2051 {
2052     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2053     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2054
2055     if (toYV12) {
2056         toYV12(formatConvBuffer, src, srcW, pal);
2057         src= formatConvBuffer;
2058     }
2059
2060     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2061         c->scale8To16Rv((uint16_t *) formatConvBuffer, src, srcW);
2062         src = formatConvBuffer;
2063     }
2064
2065     if (!c->hyscale_fast) {
2066         c->hScale(dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2067     } else { // fast bilinear upscale / crap downscale
2068         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2069     }
2070
2071     if (convertRange)
2072         convertRange(dst, dstWidth);
2073
2074     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2075         c->scale19To15Fw(dst, (int32_t *) dst, dstWidth);
2076     }
2077 }
2078
2079 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2080                            int dstWidth, const uint8_t *src1,
2081                            const uint8_t *src2, int srcW, int xInc)
2082 {
2083     int i;
2084     unsigned int xpos=0;
2085     for (i=0;i<dstWidth;i++) {
2086         register unsigned int xx=xpos>>16;
2087         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2088         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2089         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2090         xpos+=xInc;
2091     }
2092 }
2093
2094 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2095                                      const uint8_t *src1, const uint8_t *src2,
2096                                      int srcW, int xInc, const int16_t *hChrFilter,
2097                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2098                                      uint8_t *formatConvBuffer, uint32_t *pal)
2099 {
2100     if (c->chrToYV12) {
2101         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * c->scalingBpp >> 3, 16);
2102         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2103         src1= formatConvBuffer;
2104         src2= buf2;
2105     }
2106
2107     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2108         uint8_t *buf2 = (formatConvBuffer + FFALIGN(srcW * 2, 16));
2109         c->scale8To16Rv((uint16_t *) formatConvBuffer, src1, srcW);
2110         c->scale8To16Rv((uint16_t *) buf2,             src2, srcW);
2111         src1 = formatConvBuffer;
2112         src2 = buf2;
2113     }
2114
2115     if (!c->hcscale_fast) {
2116         c->hScale(dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2117         c->hScale(dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2118     } else { // fast bilinear upscale / crap downscale
2119         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2120     }
2121
2122     if (c->chrConvertRange)
2123         c->chrConvertRange(dst1, dst2, dstWidth);
2124
2125     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2126         c->scale19To15Fw(dst1, (int32_t *) dst1, dstWidth);
2127         c->scale19To15Fw(dst2, (int32_t *) dst2, dstWidth);
2128     }
2129 }
2130
2131 static av_always_inline void
2132 find_c_packed_planar_out_funcs(SwsContext *c,
2133                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
2134                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2135                                yuv2packedX_fn *yuv2packedX)
2136 {
2137     enum PixelFormat dstFormat = c->dstFormat;
2138
2139     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2140         *yuv2yuvX     = yuv2nv12X_c;
2141     } else if (is16BPS(dstFormat)) {
2142         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
2143     } else if (is9_OR_10BPS(dstFormat)) {
2144         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2145             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
2146         } else {
2147             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
2148         }
2149     } else {
2150         *yuv2yuv1     = yuv2yuv1_c;
2151         *yuv2yuvX     = yuv2yuvX_c;
2152     }
2153     if(c->flags & SWS_FULL_CHR_H_INT) {
2154         switch (dstFormat) {
2155             case PIX_FMT_RGBA:
2156 #if CONFIG_SMALL
2157                 *yuv2packedX = yuv2rgba32_full_X_c;
2158 #else
2159 #if CONFIG_SWSCALE_ALPHA
2160                 if (c->alpPixBuf) {
2161                     *yuv2packedX = yuv2rgba32_full_X_c;
2162                 } else
2163 #endif /* CONFIG_SWSCALE_ALPHA */
2164                 {
2165                     *yuv2packedX = yuv2rgbx32_full_X_c;
2166                 }
2167 #endif /* !CONFIG_SMALL */
2168                 break;
2169             case PIX_FMT_ARGB:
2170 #if CONFIG_SMALL
2171                 *yuv2packedX = yuv2argb32_full_X_c;
2172 #else
2173 #if CONFIG_SWSCALE_ALPHA
2174                 if (c->alpPixBuf) {
2175                     *yuv2packedX = yuv2argb32_full_X_c;
2176                 } else
2177 #endif /* CONFIG_SWSCALE_ALPHA */
2178                 {
2179                     *yuv2packedX = yuv2xrgb32_full_X_c;
2180                 }
2181 #endif /* !CONFIG_SMALL */
2182                 break;
2183             case PIX_FMT_BGRA:
2184 #if CONFIG_SMALL
2185                 *yuv2packedX = yuv2bgra32_full_X_c;
2186 #else
2187 #if CONFIG_SWSCALE_ALPHA
2188                 if (c->alpPixBuf) {
2189                     *yuv2packedX = yuv2bgra32_full_X_c;
2190                 } else
2191 #endif /* CONFIG_SWSCALE_ALPHA */
2192                 {
2193                     *yuv2packedX = yuv2bgrx32_full_X_c;
2194                 }
2195 #endif /* !CONFIG_SMALL */
2196                 break;
2197             case PIX_FMT_ABGR:
2198 #if CONFIG_SMALL
2199                 *yuv2packedX = yuv2abgr32_full_X_c;
2200 #else
2201 #if CONFIG_SWSCALE_ALPHA
2202                 if (c->alpPixBuf) {
2203                     *yuv2packedX = yuv2abgr32_full_X_c;
2204                 } else
2205 #endif /* CONFIG_SWSCALE_ALPHA */
2206                 {
2207                     *yuv2packedX = yuv2xbgr32_full_X_c;
2208                 }
2209 #endif /* !CONFIG_SMALL */
2210                 break;
2211             case PIX_FMT_RGB24:
2212             *yuv2packedX = yuv2rgb24_full_X_c;
2213             break;
2214         case PIX_FMT_BGR24:
2215             *yuv2packedX = yuv2bgr24_full_X_c;
2216             break;
2217         }
2218     } else {
2219         switch (dstFormat) {
2220         case PIX_FMT_GRAY16BE:
2221             *yuv2packed1 = yuv2gray16BE_1_c;
2222             *yuv2packed2 = yuv2gray16BE_2_c;
2223             *yuv2packedX = yuv2gray16BE_X_c;
2224             break;
2225         case PIX_FMT_GRAY16LE:
2226             *yuv2packed1 = yuv2gray16LE_1_c;
2227             *yuv2packed2 = yuv2gray16LE_2_c;
2228             *yuv2packedX = yuv2gray16LE_X_c;
2229             break;
2230         case PIX_FMT_MONOWHITE:
2231             *yuv2packed1 = yuv2monowhite_1_c;
2232             *yuv2packed2 = yuv2monowhite_2_c;
2233             *yuv2packedX = yuv2monowhite_X_c;
2234             break;
2235         case PIX_FMT_MONOBLACK:
2236             *yuv2packed1 = yuv2monoblack_1_c;
2237             *yuv2packed2 = yuv2monoblack_2_c;
2238             *yuv2packedX = yuv2monoblack_X_c;
2239             break;
2240         case PIX_FMT_YUYV422:
2241             *yuv2packed1 = yuv2yuyv422_1_c;
2242             *yuv2packed2 = yuv2yuyv422_2_c;
2243             *yuv2packedX = yuv2yuyv422_X_c;
2244             break;
2245         case PIX_FMT_UYVY422:
2246             *yuv2packed1 = yuv2uyvy422_1_c;
2247             *yuv2packed2 = yuv2uyvy422_2_c;
2248             *yuv2packedX = yuv2uyvy422_X_c;
2249             break;
2250         case PIX_FMT_RGB48LE:
2251             *yuv2packed1 = yuv2rgb48le_1_c;
2252             *yuv2packed2 = yuv2rgb48le_2_c;
2253             *yuv2packedX = yuv2rgb48le_X_c;
2254             break;
2255         case PIX_FMT_RGB48BE:
2256             *yuv2packed1 = yuv2rgb48be_1_c;
2257             *yuv2packed2 = yuv2rgb48be_2_c;
2258             *yuv2packedX = yuv2rgb48be_X_c;
2259             break;
2260         case PIX_FMT_BGR48LE:
2261             *yuv2packed1 = yuv2bgr48le_1_c;
2262             *yuv2packed2 = yuv2bgr48le_2_c;
2263             *yuv2packedX = yuv2bgr48le_X_c;
2264             break;
2265         case PIX_FMT_BGR48BE:
2266             *yuv2packed1 = yuv2bgr48be_1_c;
2267             *yuv2packed2 = yuv2bgr48be_2_c;
2268             *yuv2packedX = yuv2bgr48be_X_c;
2269             break;
2270         case PIX_FMT_RGB32:
2271         case PIX_FMT_BGR32:
2272 #if CONFIG_SMALL
2273             *yuv2packed1 = yuv2rgb32_1_c;
2274             *yuv2packed2 = yuv2rgb32_2_c;
2275             *yuv2packedX = yuv2rgb32_X_c;
2276 #else
2277 #if CONFIG_SWSCALE_ALPHA
2278                 if (c->alpPixBuf) {
2279                     *yuv2packed1 = yuv2rgba32_1_c;
2280                     *yuv2packed2 = yuv2rgba32_2_c;
2281                     *yuv2packedX = yuv2rgba32_X_c;
2282                 } else
2283 #endif /* CONFIG_SWSCALE_ALPHA */
2284                 {
2285                     *yuv2packed1 = yuv2rgbx32_1_c;
2286                     *yuv2packed2 = yuv2rgbx32_2_c;
2287                     *yuv2packedX = yuv2rgbx32_X_c;
2288                 }
2289 #endif /* !CONFIG_SMALL */
2290             break;
2291         case PIX_FMT_RGB32_1:
2292         case PIX_FMT_BGR32_1:
2293 #if CONFIG_SMALL
2294                 *yuv2packed1 = yuv2rgb32_1_1_c;
2295                 *yuv2packed2 = yuv2rgb32_1_2_c;
2296                 *yuv2packedX = yuv2rgb32_1_X_c;
2297 #else
2298 #if CONFIG_SWSCALE_ALPHA
2299                 if (c->alpPixBuf) {
2300                     *yuv2packed1 = yuv2rgba32_1_1_c;
2301                     *yuv2packed2 = yuv2rgba32_1_2_c;
2302                     *yuv2packedX = yuv2rgba32_1_X_c;
2303                 } else
2304 #endif /* CONFIG_SWSCALE_ALPHA */
2305                 {
2306                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2307                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2308                     *yuv2packedX = yuv2rgbx32_1_X_c;
2309                 }
2310 #endif /* !CONFIG_SMALL */
2311                 break;
2312         case PIX_FMT_RGB24:
2313             *yuv2packed1 = yuv2rgb24_1_c;
2314             *yuv2packed2 = yuv2rgb24_2_c;
2315             *yuv2packedX = yuv2rgb24_X_c;
2316             break;
2317         case PIX_FMT_BGR24:
2318             *yuv2packed1 = yuv2bgr24_1_c;
2319             *yuv2packed2 = yuv2bgr24_2_c;
2320             *yuv2packedX = yuv2bgr24_X_c;
2321             break;
2322         case PIX_FMT_RGB565LE:
2323         case PIX_FMT_RGB565BE:
2324         case PIX_FMT_BGR565LE:
2325         case PIX_FMT_BGR565BE:
2326             *yuv2packed1 = yuv2rgb16_1_c;
2327             *yuv2packed2 = yuv2rgb16_2_c;
2328             *yuv2packedX = yuv2rgb16_X_c;
2329             break;
2330         case PIX_FMT_RGB555LE:
2331         case PIX_FMT_RGB555BE:
2332         case PIX_FMT_BGR555LE:
2333         case PIX_FMT_BGR555BE:
2334             *yuv2packed1 = yuv2rgb15_1_c;
2335             *yuv2packed2 = yuv2rgb15_2_c;
2336             *yuv2packedX = yuv2rgb15_X_c;
2337             break;
2338         case PIX_FMT_RGB444LE:
2339         case PIX_FMT_RGB444BE:
2340         case PIX_FMT_BGR444LE:
2341         case PIX_FMT_BGR444BE:
2342             *yuv2packed1 = yuv2rgb12_1_c;
2343             *yuv2packed2 = yuv2rgb12_2_c;
2344             *yuv2packedX = yuv2rgb12_X_c;
2345             break;
2346         case PIX_FMT_RGB8:
2347         case PIX_FMT_BGR8:
2348             *yuv2packed1 = yuv2rgb8_1_c;
2349             *yuv2packed2 = yuv2rgb8_2_c;
2350             *yuv2packedX = yuv2rgb8_X_c;
2351             break;
2352         case PIX_FMT_RGB4:
2353         case PIX_FMT_BGR4:
2354             *yuv2packed1 = yuv2rgb4_1_c;
2355             *yuv2packed2 = yuv2rgb4_2_c;
2356             *yuv2packedX = yuv2rgb4_X_c;
2357             break;
2358         case PIX_FMT_RGB4_BYTE:
2359         case PIX_FMT_BGR4_BYTE:
2360             *yuv2packed1 = yuv2rgb4b_1_c;
2361             *yuv2packed2 = yuv2rgb4b_2_c;
2362             *yuv2packedX = yuv2rgb4b_X_c;
2363             break;
2364         }
2365     }
2366 }
2367
/* Set to 1 to trace the line-buffer bookkeeping of swScale() through av_log(). */
#define DEBUG_SWSCALE_BUFFERS 0

/*
 * Debug trace helper: forwards its arguments to av_log() at AV_LOG_DEBUG
 * level when DEBUG_SWSCALE_BUFFERS is non-zero.
 *
 * The expansion site must have a variable named `c` in scope; it is passed
 * as the av_log() context.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement. A bare `if (...) av_log(...)` would capture an `else` that
 * follows the macro invocation (dangling else).
 */
#define DEBUG_BUFFERS(...)                            \
    do {                                              \
        if (DEBUG_SWSCALE_BUFFERS)                    \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__);     \
    } while (0)
2370
2371 static int swScale(SwsContext *c, const uint8_t* src[],
2372                    int srcStride[], int srcSliceY,
2373                    int srcSliceH, uint8_t* dst[], int dstStride[])
2374 {
2375     /* load a few things into local vars to make the code more readable? and faster */
2376     const int srcW= c->srcW;
2377     const int dstW= c->dstW;
2378     const int dstH= c->dstH;
2379     const int chrDstW= c->chrDstW;
2380     const int chrSrcW= c->chrSrcW;
2381     const int lumXInc= c->lumXInc;
2382     const int chrXInc= c->chrXInc;
2383     const enum PixelFormat dstFormat= c->dstFormat;
2384     const int flags= c->flags;
2385     int16_t *vLumFilterPos= c->vLumFilterPos;
2386     int16_t *vChrFilterPos= c->vChrFilterPos;
2387     int16_t *hLumFilterPos= c->hLumFilterPos;
2388     int16_t *hChrFilterPos= c->hChrFilterPos;
2389     int16_t *vLumFilter= c->vLumFilter;
2390     int16_t *vChrFilter= c->vChrFilter;
2391     int16_t *hLumFilter= c->hLumFilter;
2392     int16_t *hChrFilter= c->hChrFilter;
2393     int32_t *lumMmxFilter= c->lumMmxFilter;
2394     int32_t *chrMmxFilter= c->chrMmxFilter;
2395     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2396     const int vLumFilterSize= c->vLumFilterSize;
2397     const int vChrFilterSize= c->vChrFilterSize;
2398     const int hLumFilterSize= c->hLumFilterSize;
2399     const int hChrFilterSize= c->hChrFilterSize;
2400     int16_t **lumPixBuf= c->lumPixBuf;
2401     int16_t **chrUPixBuf= c->chrUPixBuf;
2402     int16_t **chrVPixBuf= c->chrVPixBuf;
2403     int16_t **alpPixBuf= c->alpPixBuf;
2404     const int vLumBufSize= c->vLumBufSize;
2405     const int vChrBufSize= c->vChrBufSize;
2406     uint8_t *formatConvBuffer= c->formatConvBuffer;
2407     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2408     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2409     int lastDstY;
2410     uint32_t *pal=c->pal_yuv;
2411     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2412     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2413     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2414     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2415     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2416
2417     /* vars which will change and which we need to store back in the context */
2418     int dstY= c->dstY;
2419     int lumBufIndex= c->lumBufIndex;
2420     int chrBufIndex= c->chrBufIndex;
2421     int lastInLumBuf= c->lastInLumBuf;
2422     int lastInChrBuf= c->lastInChrBuf;
2423
2424     if (isPacked(c->srcFormat)) {
2425         src[0]=
2426         src[1]=
2427         src[2]=
2428         src[3]= src[0];
2429         srcStride[0]=
2430         srcStride[1]=
2431         srcStride[2]=
2432         srcStride[3]= srcStride[0];
2433     }
2434     srcStride[1]<<= c->vChrDrop;
2435     srcStride[2]<<= c->vChrDrop;
2436
2437     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2438                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2439                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2440     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2441                    srcSliceY,    srcSliceH,    dstY,    dstH);
2442     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2443                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2444
2445     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2446         static int warnedAlready=0; //FIXME move this into the context perhaps
2447         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2448             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2449                    "         ->cannot do aligned memory accesses anymore\n");
2450             warnedAlready=1;
2451         }
2452     }
2453
2454     /* Note the user might start scaling the picture in the middle so this
2455        will not get executed. This is not really intended but works
2456        currently, so people might do it. */
2457     if (srcSliceY ==0) {
2458         lumBufIndex=-1;
2459         chrBufIndex=-1;
2460         dstY=0;
2461         lastInLumBuf= -1;
2462         lastInChrBuf= -1;
2463     }
2464
2465     lastDstY= dstY;
2466
2467     for (;dstY < dstH; dstY++) {
2468         const int chrDstY= dstY>>c->chrDstVSubSample;
2469         uint8_t *dest[4] = {
2470             dst[0] + dstStride[0] * dstY,
2471             dst[1] + dstStride[1] * chrDstY,
2472             dst[2] + dstStride[2] * chrDstY,
2473             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2474         };
2475
2476         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2477         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2478         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2479         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2480         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2481         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2482         int enough_lines;
2483
2484         //handle holes (FAST_BILINEAR & weird filters)
2485         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2486         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2487         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2488         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2489
2490         DEBUG_BUFFERS("dstY: %d\n", dstY);
2491         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2492                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2493         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2494                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2495
2496         // Do we have enough lines in this slice to output the dstY line
2497         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2498
2499         if (!enough_lines) {
2500             lastLumSrcY = srcSliceY + srcSliceH - 1;
2501             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2502             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2503                                             lastLumSrcY, lastChrSrcY);
2504         }
2505
2506         //Do horizontal scaling
2507         while(lastInLumBuf < lastLumSrcY) {
2508             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2509             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2510             lumBufIndex++;
2511             assert(lumBufIndex < 2*vLumBufSize);
2512             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2513             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2514             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2515                     hLumFilter, hLumFilterPos, hLumFilterSize,
2516                     formatConvBuffer,
2517                     pal, 0);
2518             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2519                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2520                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2521                         formatConvBuffer,
2522                         pal, 1);
2523             lastInLumBuf++;
2524             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2525                                lumBufIndex,    lastInLumBuf);
2526         }
2527         while(lastInChrBuf < lastChrSrcY) {
2528             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2529             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2530             chrBufIndex++;
2531             assert(chrBufIndex < 2*vChrBufSize);
2532             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2533             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2534             //FIXME replace parameters through context struct (some at least)
2535
2536             if (c->needs_hcscale)
2537                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2538                           chrDstW, src1, src2, chrSrcW, chrXInc,
2539                           hChrFilter, hChrFilterPos, hChrFilterSize,
2540                           formatConvBuffer, pal);
2541             lastInChrBuf++;
2542             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2543                                chrBufIndex,    lastInChrBuf);
2544         }
2545         //wrap buf index around to stay inside the ring buffer
2546         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2547         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2548         if (!enough_lines)
2549             break; //we can't output a dstY line so let's try with the next slice
2550
2551 #if HAVE_MMX
2552         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2553 #endif
2554         if (dstY >= dstH-2) {
2555             // hmm looks like we can't use MMX here without overwriting this array's tail
2556             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2557                                            &yuv2packed1, &yuv2packed2,
2558                                            &yuv2packedX);
2559         }
2560
2561         {
2562             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2563             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2564             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2565             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2566             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2567                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2568                 if ((dstY&chrSkipMask) || isGray(dstFormat))
2569                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
2570                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2571                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2572                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
2573                              dest, dstW, chrDstW);
2574                 } else { //General YV12
2575                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
2576                              lumSrcPtr, vLumFilterSize,
2577                              vChrFilter + chrDstY * vChrFilterSize,
2578                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2579                              alpSrcPtr, dest, dstW, chrDstW);
2580                 }
2581             } else {
2582                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2583                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2584                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2585                     int chrAlpha = vChrFilter[2 * dstY + 1];
2586                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2587                                 alpPixBuf ? *alpSrcPtr : NULL,
2588                                 dest[0], dstW, chrAlpha, dstY);
2589                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2590                     int lumAlpha = vLumFilter[2 * dstY + 1];
2591                     int chrAlpha = vChrFilter[2 * dstY + 1];
2592                     lumMmxFilter[2] =
2593                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2594                     chrMmxFilter[2] =
2595                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2596                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2597                                 alpPixBuf ? alpSrcPtr : NULL,
2598                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2599                 } else { //general RGB
2600                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2601                                 lumSrcPtr, vLumFilterSize,
2602                                 vChrFilter + dstY * vChrFilterSize,
2603                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2604                                 alpSrcPtr, dest[0], dstW, dstY);
2605                 }
2606             }
2607         }
2608     }
2609
2610     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2611         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2612
2613 #if HAVE_MMX2
2614     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2615         __asm__ volatile("sfence":::"memory");
2616 #endif
2617     emms_c();
2618
2619     /* store changed local vars back in the context */
2620     c->dstY= dstY;
2621     c->lumBufIndex= lumBufIndex;
2622     c->chrBufIndex= chrBufIndex;
2623     c->lastInLumBuf= lastInLumBuf;
2624     c->lastInChrBuf= lastInChrBuf;
2625
2626     return dstY - lastDstY;
2627 }
2628
2629 static av_cold void sws_init_swScale_c(SwsContext *c)
2630 {
2631     enum PixelFormat srcFormat = c->srcFormat;
2632
2633     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2634                                    &c->yuv2packed1, &c->yuv2packed2,
2635                                    &c->yuv2packedX);
2636
2637     c->chrToYV12 = NULL;
2638     switch(srcFormat) {
2639         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2640         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2641         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2642         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2643         case PIX_FMT_RGB8     :
2644         case PIX_FMT_BGR8     :
2645         case PIX_FMT_PAL8     :
2646         case PIX_FMT_BGR4_BYTE:
2647         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2648         case PIX_FMT_YUV444P9LE:
2649         case PIX_FMT_YUV420P9LE:  c->chrToYV12 = LE9ToUV_c; break;
2650         case PIX_FMT_YUV422P10LE:
2651         case PIX_FMT_YUV444P10LE:
2652         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
2653         case PIX_FMT_YUV444P9BE:
2654         case PIX_FMT_YUV420P9BE:  c->chrToYV12 = BE9ToUV_c; break;
2655         case PIX_FMT_YUV444P10BE:
2656         case PIX_FMT_YUV422P10BE:
2657         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
2658 #if HAVE_BIGENDIAN
2659         case PIX_FMT_YUV420P16LE:
2660         case PIX_FMT_YUV422P16LE:
2661         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2662 #else
2663         case PIX_FMT_YUV420P16BE:
2664         case PIX_FMT_YUV422P16BE:
2665         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2666 #endif
2667     }
2668     if (c->chrSrcHSubSample) {
2669         switch(srcFormat) {
2670         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2671         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2672         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2673         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2674         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2675         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2676         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2677         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2678         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2679         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2680         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2681         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2682         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2683         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2684         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2685         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2686         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2687         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2688         }
2689     } else {
2690         switch(srcFormat) {
2691         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2692         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2693         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2694         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2695         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2696         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2697         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2698         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2699         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2700         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2701         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2702         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2703         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2704         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2705         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2706         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2707         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2708         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2709         }
2710     }
2711
2712     c->lumToYV12 = NULL;
2713     c->alpToYV12 = NULL;
2714     switch (srcFormat) {
2715     case PIX_FMT_YUV444P9LE:
2716     case PIX_FMT_YUV420P9LE:  c->lumToYV12 = LE9ToY_c; break;
2717     case PIX_FMT_YUV444P10LE:
2718     case PIX_FMT_YUV422P10LE:
2719     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2720     case PIX_FMT_YUV444P9BE:
2721     case PIX_FMT_YUV420P9BE:  c->lumToYV12 = BE9ToY_c; break;
2722     case PIX_FMT_YUV444P10BE:
2723     case PIX_FMT_YUV422P10BE:
2724     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2725 #if HAVE_BIGENDIAN
2726     case PIX_FMT_YUV420P16LE:
2727     case PIX_FMT_YUV422P16LE:
2728     case PIX_FMT_YUV444P16LE:
2729     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2730 #else
2731     case PIX_FMT_YUV420P16BE:
2732     case PIX_FMT_YUV422P16BE:
2733     case PIX_FMT_YUV444P16BE:
2734     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2735 #endif
2736     case PIX_FMT_YUYV422  :
2737     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2738     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2739     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2740     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2741     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2742     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2743     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2744     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2745     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2746     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2747     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2748     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2749     case PIX_FMT_RGB8     :
2750     case PIX_FMT_BGR8     :
2751     case PIX_FMT_PAL8     :
2752     case PIX_FMT_BGR4_BYTE:
2753     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2754     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2755     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2756     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2757     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2758     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2759     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2760     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2761     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2762     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2763     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2764     }
2765     if (c->alpPixBuf) {
2766         switch (srcFormat) {
2767         case PIX_FMT_BGRA:
2768         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2769         case PIX_FMT_ABGR:
2770         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2771         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2772         }
2773     }
2774
2775     if (c->scalingBpp == 8) {
2776     c->hScale       = hScale_c;
2777     if (c->flags & SWS_FAST_BILINEAR) {
2778         c->hyscale_fast = hyscale_fast_c;
2779         c->hcscale_fast = hcscale_fast_c;
2780     }
2781
2782     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2783         if (c->srcRange) {
2784             c->lumConvertRange = lumRangeFromJpeg_c;
2785             c->chrConvertRange = chrRangeFromJpeg_c;
2786         } else {
2787             c->lumConvertRange = lumRangeToJpeg_c;
2788             c->chrConvertRange = chrRangeToJpeg_c;
2789         }
2790     }
2791     } else {
2792         c->hScale = hScale16_c;
2793         c->scale19To15Fw = scale19To15Fw_c;
2794         c->scale8To16Rv  = scale8To16Rv_c;
2795
2796         if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2797             if (c->srcRange) {
2798                 c->lumConvertRange = lumRangeFromJpeg16_c;
2799                 c->chrConvertRange = chrRangeFromJpeg16_c;
2800             } else {
2801                 c->lumConvertRange = lumRangeToJpeg16_c;
2802                 c->chrConvertRange = chrRangeToJpeg16_c;
2803             }
2804         }
2805     }
2806
2807     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2808           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2809         c->needs_hcscale = 1;
2810 }
2811
2812 SwsFunc ff_getSwsFunc(SwsContext *c)
2813 {
2814     sws_init_swScale_c(c);
2815
2816     if (HAVE_MMX)
2817         ff_sws_init_swScale_mmx(c);
2818     if (HAVE_ALTIVEC)
2819         ff_sws_init_swScale_altivec(c);
2820
2821     return swScale;
2822 }