git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include <inttypes.h>
  22 #include <string.h>
  23 #include <math.h>
  24 #include <stdio.h>
  25 #include "config.h"
  26 #include <assert.h>
  27 #include "swscale.h"
  28 #include "swscale_internal.h"
  29 #include "rgb2rgb.h"
  30 #include "libavutil/intreadwrite.h"
  31 #include "libavutil/cpu.h"
  32 #include "libavutil/avutil.h"
  33 #include "libavutil/mathematics.h"
  34 #include "libavutil/bswap.h"
  35 #include "libavutil/pixdesc.h"
  36
  37 #define DITHER1XBPP
  38
  39 #define RGB2YUV_SHIFT 15
  40 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  41 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  42 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  43 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  44 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  45 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  46 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  47 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  48 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  49
  50 /*
  51 NOTES
  52 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  53
  54 TODO
  55 more intelligent misalignment avoidance for the horizontal scaler
  56 write special vertical cubic upscale version
  57 optimize C code (YV12 / minmax)
  58 add support for packed pixel YUV input & output
  59 add support for Y8 output
  60 optimize BGR24 & BGR32
  61 add BGR4 output support
  62 write special BGR->BGR scaler
  63 */
  64
  65 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  66 {  1,   3,   1,   3,   1,   3,   1,   3, },
  67 {  2,   0,   2,   0,   2,   0,   2,   0, },
  68 };
  69
  70 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
  71 {  6,   2,   6,   2,   6,   2,   6,   2, },
  72 {  0,   4,   0,   4,   0,   4,   0,   4, },
  73 };
  74
  75 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
  76 {  8,   4,  11,   7,   8,   4,  11,   7, },
  77 {  2,  14,   1,  13,   2,  14,   1,  13, },
  78 { 10,   6,   9,   5,  10,   6,   9,   5, },
  79 {  0,  12,   3,  15,   0,  12,   3,  15, },
  80 };
  81
  82 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
  83 { 17,   9,  23,  15,  16,   8,  22,  14, },
  84 {  5,  29,   3,  27,   4,  28,   2,  26, },
  85 { 21,  13,  19,  11,  20,  12,  18,  10, },
  86 {  0,  24,   6,  30,   1,  25,   7,  31, },
  87 { 16,   8,  22,  14,  17,   9,  23,  15, },
  88 {  4,  28,   2,  26,   5,  29,   3,  27, },
  89 { 20,  12,  18,  10,  21,  13,  19,  11, },
  90 {  1,  25,   7,  31,   0,  24,   6,  30, },
  91 };
  92
  93 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
  94 {  0,  55,  14,  68,   3,  58,  17,  72, },
  95 { 37,  18,  50,  32,  40,  22,  54,  35, },
  96 {  9,  64,   5,  59,  13,  67,   8,  63, },
  97 { 46,  27,  41,  23,  49,  31,  44,  26, },
  98 {  2,  57,  16,  71,   1,  56,  15,  70, },
  99 { 39,  21,  52,  34,  38,  19,  51,  33, },
 100 { 11,  66,   7,  62,  10,  65,   6,  60, },
 101 { 48,  30,  43,  25,  47,  29,  42,  24, },
 102 };
 103
 104 #if 1
 105 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 106 {117,  62, 158, 103, 113,  58, 155, 100, },
 107 { 34, 199,  21, 186,  31, 196,  17, 182, },
 108 {144,  89, 131,  76, 141,  86, 127,  72, },
 109 {  0, 165,  41, 206,  10, 175,  52, 217, },
 110 {110,  55, 151,  96, 120,  65, 162, 107, },
 111 { 28, 193,  14, 179,  38, 203,  24, 189, },
 112 {138,  83, 124,  69, 148,  93, 134,  79, },
 113 {  7, 172,  48, 213,   3, 168,  45, 210, },
 114 };
 115 #elif 1
 116 // tries to correct a gamma of 1.5
 117 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 118 {  0, 143,  18, 200,   2, 156,  25, 215, },
 119 { 78,  28, 125,  64,  89,  36, 138,  74, },
 120 { 10, 180,   3, 161,  16, 195,   8, 175, },
 121 {109,  51,  93,  38, 121,  60, 105,  47, },
 122 {  1, 152,  23, 210,   0, 147,  20, 205, },
 123 { 85,  33, 134,  71,  81,  30, 130,  67, },
 124 { 14, 190,   6, 171,  12, 185,   5, 166, },
 125 {117,  57, 101,  44, 113,  54,  97,  41, },
 126 };
 127 #elif 1
 128 // tries to correct a gamma of 2.0
 129 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 130 {  0, 124,   8, 193,   0, 140,  12, 213, },
 131 { 55,  14, 104,  42,  66,  19, 119,  52, },
 132 {  3, 168,   1, 145,   6, 187,   3, 162, },
 133 { 86,  31,  70,  21,  99,  39,  82,  28, },
 134 {  0, 134,  11, 206,   0, 129,   9, 200, },
 135 { 62,  17, 114,  48,  58,  16, 109,  45, },
 136 {  5, 181,   2, 157,   4, 175,   1, 151, },
 137 { 95,  36,  78,  26,  90,  34,  74,  24, },
 138 };
 139 #else
 140 // tries to correct a gamma of 2.5
 141 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 142 {  0, 107,   3, 187,   0, 125,   6, 212, },
 143 { 39,   7,  86,  28,  49,  11, 102,  36, },
 144 {  1, 158,   0, 131,   3, 180,   1, 151, },
 145 { 68,  19,  52,  12,  81,  25,  64,  17, },
 146 {  0, 119,   5, 203,   0, 113,   4, 195, },
 147 { 45,   9,  96,  33,  42,   8,  91,  30, },
 148 {  2, 172,   1, 144,   2, 165,   0, 137, },
 149 { 77,  23,  60,  15,  72,  21,  56,  14, },
 150 };
 151 #endif
 152 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 153 {  36, 68, 60, 92, 34, 66, 58, 90,},
 154 { 100,  4,124, 28, 98,  2,122, 26,},
 155 {  52, 84, 44, 76, 50, 82, 42, 74,},
 156 { 116, 20,108, 12,114, 18,106, 10,},
 157 {  32, 64, 56, 88, 38, 70, 62, 94,},
 158 {  96,  0,120, 24,102,  6,126, 30,},
 159 {  48, 80, 40, 72, 54, 86, 46, 78,},
 160 { 112, 16,104,  8,118, 22,110, 14,},
 161 };
 162 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 163 {  64, 64, 64, 64, 64, 64, 64, 64 };
 164
 165 #define output_pixel(pos, val, bias, signedness) \
 166     if (big_endian) { \
 167         AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 168     } else { \
 169         AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 170     }
 171
 172 static av_always_inline void
 173 yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
 174                          int big_endian, int output_bits)
 175 {
 176     int i;
 177     int shift = 19 - output_bits;
 178
 179     for (i = 0; i < dstW; i++) {
 180         int val = src[i] + (1 << (shift - 1));
 181         output_pixel(&dest[i], val, 0, uint);
 182     }
 183 }
 184
 185 static av_always_inline void
 186 yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
 187                          const int32_t **src, uint16_t *dest, int dstW,
 188                          int big_endian, int output_bits)
 189 {
 190     int i;
 191     int shift = 15 + 16 - output_bits;
 192
 193     for (i = 0; i < dstW; i++) {
 194         int val = 1 << (30-output_bits);
 195         int j;
 196
 197         /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
 198          * filters (or anything with negative coeffs, the range can be slightly
 199          * wider in both directions. To account for this overflow, we subtract
 200          * a constant so it always fits in the signed range (assuming a
 201          * reasonable filterSize), and re-add that at the end. */
 202         val -= 0x40000000;
 203         for (j = 0; j < filterSize; j++)
 204             val += src[j][i] * filter[j];
 205
 206         output_pixel(&dest[i], val, 0x8000, int);
 207     }
 208 }
 209
 210 #undef output_pixel
 211
 212 #define output_pixel(pos, val) \
 213     if (big_endian) { \
 214         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 215     } else { \
 216         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 217     }
 218
 219 static av_always_inline void
 220 yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
 221                          int big_endian, int output_bits)
 222 {
 223     int i;
 224     int shift = 15 - output_bits;
 225
 226     for (i = 0; i < dstW; i++) {
 227         int val = src[i] + (1 << (shift - 1));
 228         output_pixel(&dest[i], val);
 229     }
 230 }
 231
 232 static av_always_inline void
 233 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
 234                          const int16_t **src, uint16_t *dest, int dstW,
 235                          int big_endian, int output_bits)
 236 {
 237     int i;
 238     int shift = 11 + 16 - output_bits;
 239
 240     for (i = 0; i < dstW; i++) {
 241         int val = 1 << (26-output_bits);
 242         int j;
 243
 244         for (j = 0; j < filterSize; j++)
 245             val += src[j][i] * filter[j];
 246
 247         output_pixel(&dest[i], val);
 248     }
 249 }
 250
 251 #undef output_pixel
 252
 253 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
 254 static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
 255                               uint8_t *dest, int dstW, \
 256                               const uint8_t *dither, int offset)\
 257 { \
 258     yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
 259                          (uint16_t *) dest, dstW, is_be, bits); \
 260 }\
 261 static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
 262                               const int16_t **src, uint8_t *dest, int dstW, \
 263                               const uint8_t *dither, int offset)\
 264 { \
 265     yuv2planeX_## template_size ## _c_template(filter, \
 266                          filterSize, (const typeX_t **) src, \
 267                          (uint16_t *) dest, dstW, is_be, bits); \
 268 }
 269 yuv2NBPS( 9, BE, 1, 10, int16_t)
 270 yuv2NBPS( 9, LE, 0, 10, int16_t)
 271 yuv2NBPS(10, BE, 1, 10, int16_t)
 272 yuv2NBPS(10, LE, 0, 10, int16_t)
 273 yuv2NBPS(16, BE, 1, 16, int32_t)
 274 yuv2NBPS(16, LE, 0, 16, int32_t)
 275
 276 static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
 277                            const int16_t **src, uint8_t *dest, int dstW,
 278                            const uint8_t *dither, int offset)
 279 {
 280     int i;
 281     for (i=0; i<dstW; i++) {
 282         int val = dither[(i + offset) & 7] << 12;
 283         int j;
 284         for (j=0; j<filterSize; j++)
 285             val += src[j][i] * filter[j];
 286
 287         dest[i]= av_clip_uint8(val>>19);
 288     }
 289 }
 290
 291 static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
 292                            const uint8_t *dither, int offset)
 293 {
 294     int i;
 295     for (i=0; i<dstW; i++) {
 296         int val = (src[i] + dither[(i + offset) & 7]) >> 7;
 297         dest[i]= av_clip_uint8(val);
 298     }
 299 }
 300
 301 static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
 302                         const int16_t **chrUSrc, const int16_t **chrVSrc,
 303                         uint8_t *dest, int chrDstW)
 304 {
 305     enum PixelFormat dstFormat = c->dstFormat;
 306     const uint8_t *chrDither = c->chrDither8;
 307     int i;
 308
 309     if (dstFormat == PIX_FMT_NV12)
 310         for (i=0; i<chrDstW; i++) {
 311             int u = chrDither[i & 7] << 12;
 312             int v = chrDither[(i + 3) & 7] << 12;
 313             int j;
 314             for (j=0; j<chrFilterSize; j++) {
 315                 u += chrUSrc[j][i] * chrFilter[j];
 316                 v += chrVSrc[j][i] * chrFilter[j];
 317             }
 318
 319             dest[2*i]= av_clip_uint8(u>>19);
 320             dest[2*i+1]= av_clip_uint8(v>>19);
 321         }
 322     else
 323         for (i=0; i<chrDstW; i++) {
 324             int u = chrDither[i & 7] << 12;
 325             int v = chrDither[(i + 3) & 7] << 12;
 326             int j;
 327             for (j=0; j<chrFilterSize; j++) {
 328                 u += chrUSrc[j][i] * chrFilter[j];
 329                 v += chrVSrc[j][i] * chrFilter[j];
 330             }
 331
 332             dest[2*i]= av_clip_uint8(v>>19);
 333             dest[2*i+1]= av_clip_uint8(u>>19);
 334         }
 335 }
 336
 337 #define output_pixel(pos, val) \
 338         if (target == PIX_FMT_GRAY16BE) { \
 339             AV_WB16(pos, val); \
 340         } else { \
 341             AV_WL16(pos, val); \
 342         }
 343
 344 static av_always_inline void
 345 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 346                         const int32_t **lumSrc, int lumFilterSize,
 347                         const int16_t *chrFilter, const int32_t **chrUSrc,
 348                         const int32_t **chrVSrc, int chrFilterSize,
 349                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 350                         int y, enum PixelFormat target)
 351 {
 352     int i;
 353
 354     for (i = 0; i < (dstW >> 1); i++) {
 355         int j;
 356         int Y1 = (1 << 14) - 0x40000000;
 357         int Y2 = (1 << 14) - 0x40000000;
 358
 359         for (j = 0; j < lumFilterSize; j++) {
 360             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 361             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 362         }
 363         Y1 >>= 15;
 364         Y2 >>= 15;
 365         Y1 = av_clip_int16(Y1);
 366         Y2 = av_clip_int16(Y2);
 367         output_pixel(&dest[i * 2 + 0], 0x8000 + Y1);
 368         output_pixel(&dest[i * 2 + 1], 0x8000 + Y2);
 369     }
 370 }
 371
 372 static av_always_inline void
 373 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 374                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 375                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 376                         int yalpha, int uvalpha, int y,
 377                         enum PixelFormat target)
 378 {
 379     int  yalpha1 = 4095 - yalpha;
 380     int i;
 381     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 382
 383     for (i = 0; i < (dstW >> 1); i++) {
 384         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 385         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 386
 387         output_pixel(&dest[i * 2 + 0], Y1);
 388         output_pixel(&dest[i * 2 + 1], Y2);
 389     }
 390 }
 391
 392 static av_always_inline void
 393 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 394                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 395                         const int32_t *abuf0, uint16_t *dest, int dstW,
 396                         int uvalpha, int y, enum PixelFormat target)
 397 {
 398     int i;
 399
 400     for (i = 0; i < (dstW >> 1); i++) {
 401         int Y1 = buf0[i * 2    ] << 1;
 402         int Y2 = buf0[i * 2 + 1] << 1;
 403
 404         output_pixel(&dest[i * 2 + 0], Y1);
 405         output_pixel(&dest[i * 2 + 1], Y2);
 406     }
 407 }
 408
 409 #undef output_pixel
 410
 411 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 412 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 413                         const int16_t **_lumSrc, int lumFilterSize, \
 414                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 415                         const int16_t **_chrVSrc, int chrFilterSize, \
 416                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 417                         int y) \
 418 { \
 419     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 420                   **chrUSrc = (const int32_t **) _chrUSrc, \
 421                   **chrVSrc = (const int32_t **) _chrVSrc, \
 422                   **alpSrc  = (const int32_t **) _alpSrc; \
 423     uint16_t *dest = (uint16_t *) _dest; \
 424     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 425                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 426                           alpSrc, dest, dstW, y, fmt); \
 427 } \
 428  \
 429 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 430                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 431                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 432                         int yalpha, int uvalpha, int y) \
 433 { \
 434     const int32_t **buf  = (const int32_t **) _buf, \
 435                   **ubuf = (const int32_t **) _ubuf, \
 436                   **vbuf = (const int32_t **) _vbuf, \
 437                   **abuf = (const int32_t **) _abuf; \
 438     uint16_t *dest = (uint16_t *) _dest; \
 439     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 440                           dest, dstW, yalpha, uvalpha, y, fmt); \
 441 } \
 442  \
 443 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 444                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 445                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 446                         int uvalpha, int y) \
 447 { \
 448     const int32_t *buf0  = (const int32_t *)  _buf0, \
 449                  **ubuf  = (const int32_t **) _ubuf, \
 450                  **vbuf  = (const int32_t **) _vbuf, \
 451                   *abuf0 = (const int32_t *)  _abuf0; \
 452     uint16_t *dest = (uint16_t *) _dest; \
 453     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 454                                   dstW, uvalpha, y, fmt); \
 455 }
 456
 457 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE)
 458 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE)
 459
 460 #define output_pixel(pos, acc) \
 461     if (target == PIX_FMT_MONOBLACK) { \
 462         pos = acc; \
 463     } else { \
 464         pos = ~acc; \
 465     }
 466
 467 static av_always_inline void
 468 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 469                       const int16_t **lumSrc, int lumFilterSize,
 470                       const int16_t *chrFilter, const int16_t **chrUSrc,
 471                       const int16_t **chrVSrc, int chrFilterSize,
 472                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 473                       int y, enum PixelFormat target)
 474 {
 475     const uint8_t * const d128=dither_8x8_220[y&7];
 476     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 477     int i;
 478     unsigned acc = 0;
 479
 480     for (i = 0; i < dstW - 1; i += 2) {
 481         int j;
 482         int Y1 = 1 << 18;
 483         int Y2 = 1 << 18;
 484
 485         for (j = 0; j < lumFilterSize; j++) {
 486             Y1 += lumSrc[j][i]   * lumFilter[j];
 487             Y2 += lumSrc[j][i+1] * lumFilter[j];
 488         }
 489         Y1 >>= 19;
 490         Y2 >>= 19;
 491         if ((Y1 | Y2) & 0x100) {
 492             Y1 = av_clip_uint8(Y1);
 493             Y2 = av_clip_uint8(Y2);
 494         }
 495         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 496         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 497         if ((i & 7) == 6) {
 498             output_pixel(*dest++, acc);
 499         }
 500     }
 501 }
 502
 503 static av_always_inline void
 504 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 505                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 506                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 507                       int yalpha, int uvalpha, int y,
 508                       enum PixelFormat target)
 509 {
 510     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 511     const uint8_t * const d128 = dither_8x8_220[y & 7];
 512     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 513     int  yalpha1 = 4095 - yalpha;
 514     int i;
 515
 516     for (i = 0; i < dstW - 7; i += 8) {
 517         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 518         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 519         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 520         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 521         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 522         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 523         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 524         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 525         output_pixel(*dest++, acc);
 526     }
 527 }
 528
 529 static av_always_inline void
 530 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 531                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 532                       const int16_t *abuf0, uint8_t *dest, int dstW,
 533                       int uvalpha, int y, enum PixelFormat target)
 534 {
 535     const uint8_t * const d128 = dither_8x8_220[y & 7];
 536     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 537     int i;
 538
 539     for (i = 0; i < dstW - 7; i += 8) {
 540         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 541         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 542         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 543         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 544         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 545         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 546         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 547         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 548         output_pixel(*dest++, acc);
 549     }
 550 }
 551
 552 #undef output_pixel
 553
 554 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 555 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 556                                 const int16_t **lumSrc, int lumFilterSize, \
 557                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 558                                 const int16_t **chrVSrc, int chrFilterSize, \
 559                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 560                                 int y) \
 561 { \
 562     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 563                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 564                                   alpSrc, dest, dstW, y, fmt); \
 565 } \
 566  \
 567 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 568                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 569                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 570                                 int yalpha, int uvalpha, int y) \
 571 { \
 572     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 573                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 574 } \
 575  \
 576 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 577                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 578                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 579                                 int uvalpha, int y) \
 580 { \
 581     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 582                                   abuf0, dest, dstW, uvalpha, \
 583                                   y, fmt); \
 584 }
 585
 586 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE)
 587 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK)
 588
 589 #define output_pixels(pos, Y1, U, Y2, V) \
 590     if (target == PIX_FMT_YUYV422) { \
 591         dest[pos + 0] = Y1; \
 592         dest[pos + 1] = U;  \
 593         dest[pos + 2] = Y2; \
 594         dest[pos + 3] = V;  \
 595     } else { \
 596         dest[pos + 0] = U;  \
 597         dest[pos + 1] = Y1; \
 598         dest[pos + 2] = V;  \
 599         dest[pos + 3] = Y2; \
 600     }
 601
 602 static av_always_inline void
 603 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 604                      const int16_t **lumSrc, int lumFilterSize,
 605                      const int16_t *chrFilter, const int16_t **chrUSrc,
 606                      const int16_t **chrVSrc, int chrFilterSize,
 607                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 608                      int y, enum PixelFormat target)
 609 {
 610     int i;
 611
 612     for (i = 0; i < (dstW >> 1); i++) {
 613         int j;
 614         int Y1 = 1 << 18;
 615         int Y2 = 1 << 18;
 616         int U  = 1 << 18;
 617         int V  = 1 << 18;
 618
 619         for (j = 0; j < lumFilterSize; j++) {
 620             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 621             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 622         }
 623         for (j = 0; j < chrFilterSize; j++) {
 624             U += chrUSrc[j][i] * chrFilter[j];
 625             V += chrVSrc[j][i] * chrFilter[j];
 626         }
 627         Y1 >>= 19;
 628         Y2 >>= 19;
 629         U  >>= 19;
 630         V  >>= 19;
 631         if ((Y1 | Y2 | U | V) & 0x100) {
 632             Y1 = av_clip_uint8(Y1);
 633             Y2 = av_clip_uint8(Y2);
 634             U  = av_clip_uint8(U);
 635             V  = av_clip_uint8(V);
 636         }
 637         output_pixels(4*i, Y1, U, Y2, V);
 638     }
 639 }
 640
 641 static av_always_inline void
 642 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 643                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 644                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 645                      int yalpha, int uvalpha, int y,
 646                      enum PixelFormat target)
 647 {
 648     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 649                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 650                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 651     int  yalpha1 = 4095 - yalpha;
 652     int uvalpha1 = 4095 - uvalpha;
 653     int i;
 654
 655     for (i = 0; i < (dstW >> 1); i++) {
 656         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 657         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 658         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 659         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 660
 661         output_pixels(i * 4, Y1, U, Y2, V);
 662     }
 663 }
 664
 665 static av_always_inline void
 666 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 667                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 668                      const int16_t *abuf0, uint8_t *dest, int dstW,
 669                      int uvalpha, int y, enum PixelFormat target)
 670 {
 671     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 672                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 673     int i;
 674
 675     if (uvalpha < 2048) {
 676         for (i = 0; i < (dstW >> 1); i++) {
 677             int Y1 = buf0[i * 2]     >> 7;
 678             int Y2 = buf0[i * 2 + 1] >> 7;
 679             int U  = ubuf1[i]        >> 7;
 680             int V  = vbuf1[i]        >> 7;
 681
 682             output_pixels(i * 4, Y1, U, Y2, V);
 683         }
 684     } else {
 685         for (i = 0; i < (dstW >> 1); i++) {
 686             int Y1 =  buf0[i * 2]          >> 7;
 687             int Y2 =  buf0[i * 2 + 1]      >> 7;
 688             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 689             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 690
 691             output_pixels(i * 4, Y1, U, Y2, V);
 692         }
 693     }
 694 }
 695
 696 #undef output_pixels
 697
 698 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422)
 699 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422)
 700
 701 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 702 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 703 #define output_pixel(pos, val) \
 704     if (isBE(target)) { \
 705         AV_WB16(pos, val); \
 706     } else { \
 707         AV_WL16(pos, val); \
 708     }
 709
 710 static av_always_inline void
 711 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 712                        const int32_t **lumSrc, int lumFilterSize,
 713                        const int16_t *chrFilter, const int32_t **chrUSrc,
 714                        const int32_t **chrVSrc, int chrFilterSize,
 715                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 716                        int y, enum PixelFormat target)
 717 {
 718     int i;
 719
 720     for (i = 0; i < (dstW >> 1); i++) {
 721         int j;
 722         int Y1 = -0x40000000;
 723         int Y2 = -0x40000000;
 724         int U  = -128 << 23; // 19
 725         int V  = -128 << 23;
 726         int R, G, B;
 727
 728         for (j = 0; j < lumFilterSize; j++) {
 729             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 730             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 731         }
 732         for (j = 0; j < chrFilterSize; j++) {
 733             U += chrUSrc[j][i] * chrFilter[j];
 734             V += chrVSrc[j][i] * chrFilter[j];
 735         }
 736
 737         // 8bit: 12+15=27; 16-bit: 12+19=31
 738         Y1 >>= 14; // 10
 739         Y1 += 0x10000;
 740         Y2 >>= 14;
 741         Y2 += 0x10000;
 742         U  >>= 14;
 743         V  >>= 14;
 744
 745         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 746         Y1 -= c->yuv2rgb_y_offset;
 747         Y2 -= c->yuv2rgb_y_offset;
 748         Y1 *= c->yuv2rgb_y_coeff;
 749         Y2 *= c->yuv2rgb_y_coeff;
 750         Y1 += 1 << 13; // 21
 751         Y2 += 1 << 13;
 752         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 753
 754         R = V * c->yuv2rgb_v2r_coeff;
 755         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 756         B =                            U * c->yuv2rgb_u2b_coeff;
 757
 758         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 759         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 760         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 761         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 762         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 763         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 764         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 765         dest += 6;
 766     }
 767 }
 768
 769 static av_always_inline void
 770 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 771                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 772                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 773                        int yalpha, int uvalpha, int y,
 774                        enum PixelFormat target)
 775 {
 776     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 777                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 778                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 779     int  yalpha1 = 4095 - yalpha;
 780     int uvalpha1 = 4095 - uvalpha;
 781     int i;
 782
 783     for (i = 0; i < (dstW >> 1); i++) {
 784         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 785         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 786         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 787         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 788         int R, G, B;
 789
 790         Y1 -= c->yuv2rgb_y_offset;
 791         Y2 -= c->yuv2rgb_y_offset;
 792         Y1 *= c->yuv2rgb_y_coeff;
 793         Y2 *= c->yuv2rgb_y_coeff;
 794         Y1 += 1 << 13;
 795         Y2 += 1 << 13;
 796
 797         R = V * c->yuv2rgb_v2r_coeff;
 798         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 799         B =                            U * c->yuv2rgb_u2b_coeff;
 800
 801         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 802         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 803         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 804         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 805         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 806         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 807         dest += 6;
 808     }
 809 }
 810
 811 static av_always_inline void
 812 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 813                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 814                        const int32_t *abuf0, uint16_t *dest, int dstW,
 815                        int uvalpha, int y, enum PixelFormat target)
 816 {
 817     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 818                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 819     int i;
 820
 821     if (uvalpha < 2048) {
 822         for (i = 0; i < (dstW >> 1); i++) {
 823             int Y1 = (buf0[i * 2]    ) >> 2;
 824             int Y2 = (buf0[i * 2 + 1]) >> 2;
 825             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 826             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 827             int R, G, B;
 828
 829             Y1 -= c->yuv2rgb_y_offset;
 830             Y2 -= c->yuv2rgb_y_offset;
 831             Y1 *= c->yuv2rgb_y_coeff;
 832             Y2 *= c->yuv2rgb_y_coeff;
 833             Y1 += 1 << 13;
 834             Y2 += 1 << 13;
 835
 836             R = V * c->yuv2rgb_v2r_coeff;
 837             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 838             B =                            U * c->yuv2rgb_u2b_coeff;
 839
 840             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 841             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 842             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 843             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 844             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 845             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 846             dest += 6;
 847         }
 848     } else {
 849         for (i = 0; i < (dstW >> 1); i++) {
 850             int Y1 = (buf0[i * 2]    ) >> 2;
 851             int Y2 = (buf0[i * 2 + 1]) >> 2;
 852             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 11)) >> 3;
 853             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 11)) >> 3;
 854             int R, G, B;
 855
 856             Y1 -= c->yuv2rgb_y_offset;
 857             Y2 -= c->yuv2rgb_y_offset;
 858             Y1 *= c->yuv2rgb_y_coeff;
 859             Y2 *= c->yuv2rgb_y_coeff;
 860             Y1 += 1 << 13;
 861             Y2 += 1 << 13;
 862
 863             R = V * c->yuv2rgb_v2r_coeff;
 864             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 865             B =                            U * c->yuv2rgb_u2b_coeff;
 866
 867             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 868             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 869             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 870             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 871             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 872             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 873             dest += 6;
 874         }
 875     }
 876 }
 877
/* End of the 48-bit RGB output templates: drop their helper macros.
 * NOTE(review): the templates above use the upper-case names R_B / B_R,
 * while the two #undef lines below name the lower-case r_b / b_r. Confirm
 * whether R_B / B_R were meant to be undefined here. */
#undef output_pixel
#undef r_b
#undef b_r

/* Instantiate the 48-bit output functions for each byte order and component
 * order. YUV2PACKED16WRAPPER is defined outside this part of the file;
 * presumably it generates the _X/_2/_1 entry points around the
 * yuv2rgb48_*_c_template functions - verify against its definition. */
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE)
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE)
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE)
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
 886
 887 /*
 888  * Write out 2 RGB pixels in the target pixel format. This function takes a
 889  * R/G/B LUT as generated by ff_yuv2rgb_c_init_tables(), which takes care of
 890  * things like endianness conversion and shifting. The caller takes care of
 891  * setting the correct offset in these tables from the chroma (U/V) values.
 892  * This function then uses the luminance (Y1/Y2) values to write out the
 893  * correct RGB values into the destination buffer.
 894  */
 895 static av_always_inline void
 896 yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
 897               unsigned A1, unsigned A2,
 898               const void *_r, const void *_g, const void *_b, int y,
 899               enum PixelFormat target, int hasAlpha)
 900 {
 901     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 902         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 903         uint32_t *dest = (uint32_t *) _dest;
 904         const uint32_t *r = (const uint32_t *) _r;
 905         const uint32_t *g = (const uint32_t *) _g;
 906         const uint32_t *b = (const uint32_t *) _b;
 907
 908 #if CONFIG_SMALL
 909         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 910
 911         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
 912         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
 913 #else
 914         if (hasAlpha) {
 915             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
 916
 917             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
 918             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
 919         } else {
 920             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
 921             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 922         }
 923 #endif
 924     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
 925         uint8_t *dest = (uint8_t *) _dest;
 926         const uint8_t *r = (const uint8_t *) _r;
 927         const uint8_t *g = (const uint8_t *) _g;
 928         const uint8_t *b = (const uint8_t *) _b;
 929
 930 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
 931 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
 932         dest[i * 6 + 0] = r_b[Y1];
 933         dest[i * 6 + 1] =   g[Y1];
 934         dest[i * 6 + 2] = b_r[Y1];
 935         dest[i * 6 + 3] = r_b[Y2];
 936         dest[i * 6 + 4] =   g[Y2];
 937         dest[i * 6 + 5] = b_r[Y2];
 938 #undef r_b
 939 #undef b_r
 940     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
 941                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
 942                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
 943         uint16_t *dest = (uint16_t *) _dest;
 944         const uint16_t *r = (const uint16_t *) _r;
 945         const uint16_t *g = (const uint16_t *) _g;
 946         const uint16_t *b = (const uint16_t *) _b;
 947         int dr1, dg1, db1, dr2, dg2, db2;
 948
 949         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
 950             dr1 = dither_2x2_8[ y & 1     ][0];
 951             dg1 = dither_2x2_4[ y & 1     ][0];
 952             db1 = dither_2x2_8[(y & 1) ^ 1][0];
 953             dr2 = dither_2x2_8[ y & 1     ][1];
 954             dg2 = dither_2x2_4[ y & 1     ][1];
 955             db2 = dither_2x2_8[(y & 1) ^ 1][1];
 956         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
 957             dr1 = dither_2x2_8[ y & 1     ][0];
 958             dg1 = dither_2x2_8[ y & 1     ][1];
 959             db1 = dither_2x2_8[(y & 1) ^ 1][0];
 960             dr2 = dither_2x2_8[ y & 1     ][1];
 961             dg2 = dither_2x2_8[ y & 1     ][0];
 962             db2 = dither_2x2_8[(y & 1) ^ 1][1];
 963         } else {
 964             dr1 = dither_4x4_16[ y & 3     ][0];
 965             dg1 = dither_4x4_16[ y & 3     ][1];
 966             db1 = dither_4x4_16[(y & 3) ^ 3][0];
 967             dr2 = dither_4x4_16[ y & 3     ][1];
 968             dg2 = dither_4x4_16[ y & 3     ][0];
 969             db2 = dither_4x4_16[(y & 3) ^ 3][1];
 970         }
 971
 972         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
 973         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
 974     } else /* 8/4-bit */ {
 975         uint8_t *dest = (uint8_t *) _dest;
 976         const uint8_t *r = (const uint8_t *) _r;
 977         const uint8_t *g = (const uint8_t *) _g;
 978         const uint8_t *b = (const uint8_t *) _b;
 979         int dr1, dg1, db1, dr2, dg2, db2;
 980
 981         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
 982             const uint8_t * const d64 = dither_8x8_73[y & 7];
 983             const uint8_t * const d32 = dither_8x8_32[y & 7];
 984             dr1 = dg1 = d32[(i * 2 + 0) & 7];
 985             db1 =       d64[(i * 2 + 0) & 7];
 986             dr2 = dg2 = d32[(i * 2 + 1) & 7];
 987             db2 =       d64[(i * 2 + 1) & 7];
 988         } else {
 989             const uint8_t * const d64  = dither_8x8_73 [y & 7];
 990             const uint8_t * const d128 = dither_8x8_220[y & 7];
 991             dr1 = db1 = d128[(i * 2 + 0) & 7];
 992             dg1 =        d64[(i * 2 + 0) & 7];
 993             dr2 = db2 = d128[(i * 2 + 1) & 7];
 994             dg2 =        d64[(i * 2 + 1) & 7];
 995         }
 996
 997         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
 998             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
 999                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1000         } else {
1001             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1002             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1003         }
1004     }
1005 }
1006
1007 static av_always_inline void
1008 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1009                      const int16_t **lumSrc, int lumFilterSize,
1010                      const int16_t *chrFilter, const int16_t **chrUSrc,
1011                      const int16_t **chrVSrc, int chrFilterSize,
1012                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1013                      int y, enum PixelFormat target, int hasAlpha)
1014 {
1015     int i;
1016
1017     for (i = 0; i < (dstW >> 1); i++) {
1018         int j;
1019         int Y1 = 1 << 18;
1020         int Y2 = 1 << 18;
1021         int U  = 1 << 18;
1022         int V  = 1 << 18;
1023         int av_unused A1, A2;
1024         const void *r, *g, *b;
1025
1026         for (j = 0; j < lumFilterSize; j++) {
1027             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1028             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1029         }
1030         for (j = 0; j < chrFilterSize; j++) {
1031             U += chrUSrc[j][i] * chrFilter[j];
1032             V += chrVSrc[j][i] * chrFilter[j];
1033         }
1034         Y1 >>= 19;
1035         Y2 >>= 19;
1036         U  >>= 19;
1037         V  >>= 19;
1038         if ((Y1 | Y2 | U | V) & 0x100) {
1039             Y1 = av_clip_uint8(Y1);
1040             Y2 = av_clip_uint8(Y2);
1041             U  = av_clip_uint8(U);
1042             V  = av_clip_uint8(V);
1043         }
1044         if (hasAlpha) {
1045             A1 = 1 << 18;
1046             A2 = 1 << 18;
1047             for (j = 0; j < lumFilterSize; j++) {
1048                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1049                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1050             }
1051             A1 >>= 19;
1052             A2 >>= 19;
1053             if ((A1 | A2) & 0x100) {
1054                 A1 = av_clip_uint8(A1);
1055                 A2 = av_clip_uint8(A2);
1056             }
1057         }
1058
1059         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1060         r =  c->table_rV[V];
1061         g = (c->table_gU[U] + c->table_gV[V]);
1062         b =  c->table_bU[U];
1063
1064         yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1065                       r, g, b, y, target, hasAlpha);
1066     }
1067 }
1068
1069 static av_always_inline void
1070 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1071                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1072                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1073                      int yalpha, int uvalpha, int y,
1074                      enum PixelFormat target, int hasAlpha)
1075 {
1076     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1077                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1078                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1079                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1080                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1081     int  yalpha1 = 4095 - yalpha;
1082     int uvalpha1 = 4095 - uvalpha;
1083     int i;
1084
1085     for (i = 0; i < (dstW >> 1); i++) {
1086         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1087         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1088         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1089         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1090         int A1, A2;
1091         const void *r =  c->table_rV[V],
1092                    *g = (c->table_gU[U] + c->table_gV[V]),
1093                    *b =  c->table_bU[U];
1094
1095         if (hasAlpha) {
1096             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1097             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1098         }
1099
1100         yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1101                       r, g, b, y, target, hasAlpha);
1102     }
1103 }
1104
1105 static av_always_inline void
1106 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1107                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1108                      const int16_t *abuf0, uint8_t *dest, int dstW,
1109                      int uvalpha, int y, enum PixelFormat target,
1110                      int hasAlpha)
1111 {
1112     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1113                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1114     int i;
1115
1116     if (uvalpha < 2048) {
1117         for (i = 0; i < (dstW >> 1); i++) {
1118             int Y1 = buf0[i * 2]     >> 7;
1119             int Y2 = buf0[i * 2 + 1] >> 7;
1120             int U  = ubuf1[i]        >> 7;
1121             int V  = vbuf1[i]        >> 7;
1122             int A1, A2;
1123             const void *r =  c->table_rV[V],
1124                        *g = (c->table_gU[U] + c->table_gV[V]),
1125                        *b =  c->table_bU[U];
1126
1127             if (hasAlpha) {
1128                 A1 = abuf0[i * 2    ] >> 7;
1129                 A2 = abuf0[i * 2 + 1] >> 7;
1130             }
1131
1132             yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1133                           r, g, b, y, target, hasAlpha);
1134         }
1135     } else {
1136         for (i = 0; i < (dstW >> 1); i++) {
1137             int Y1 =  buf0[i * 2]          >> 7;
1138             int Y2 =  buf0[i * 2 + 1]      >> 7;
1139             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1140             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1141             int A1, A2;
1142             const void *r =  c->table_rV[V],
1143                        *g = (c->table_gU[U] + c->table_gV[V]),
1144                        *b =  c->table_bU[U];
1145
1146             if (hasAlpha) {
1147                 A1 = abuf0[i * 2    ] >> 7;
1148                 A2 = abuf0[i * 2 + 1] >> 7;
1149             }
1150
1151             yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1152                           r, g, b, y, target, hasAlpha);
1153         }
1154     }
1155 }
1156
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
 *
 * Defines the static function <name><ext>_X_c, which forwards all of its
 * arguments to <name><base>_X_c_template and appends the constant pixel
 * format 'fmt' and the 'hasAlpha' expression. Because 'hasAlpha' is pasted
 * into the generated body, it may reference the local SwsContext pointer 'c'.
 */
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt, hasAlpha); \
}
/*
 * YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
 *
 * Defines all three output entry points for one format: the _X_c function
 * via YUV2RGBWRAPPERX, plus <name><ext>_2_c and <name><ext>_1_c, which
 * forward to <name><base>_2_c_template and <name><base>_1_c_template with
 * 'fmt' and 'hasAlpha' appended.
 */
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt, hasAlpha); \
}
1188
/*
 * Table-based RGB output functions, all built on the yuv2rgb_*_c_template
 * functions. With CONFIG_SMALL a single 32-bit variant per layout decides at
 * run time (via c->alpPixBuf) whether alpha is written; otherwise separate
 * alpha ("a32") and no-alpha ("x32") variants are generated with a constant
 * hasAlpha, and the alpha ones only when CONFIG_SWSCALE_ALPHA is enabled.
 */
#if CONFIG_SMALL
YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1)
YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1)
#endif
YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0)
YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0)
#endif
/* Formats without an alpha channel always pass hasAlpha = 0. */
YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0)
YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0)
YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0)
YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0)
YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0)
YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0)
YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0)
YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0)
1208
1209 static av_always_inline void
1210 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1211                           const int16_t **lumSrc, int lumFilterSize,
1212                           const int16_t *chrFilter, const int16_t **chrUSrc,
1213                           const int16_t **chrVSrc, int chrFilterSize,
1214                           const int16_t **alpSrc, uint8_t *dest,
1215                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1216 {
1217     int i;
1218     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1219
1220     for (i = 0; i < dstW; i++) {
1221         int j;
1222         int Y = 0;
1223         int U = -128 << 19;
1224         int V = -128 << 19;
1225         int av_unused A;
1226         int R, G, B;
1227
1228         for (j = 0; j < lumFilterSize; j++) {
1229             Y += lumSrc[j][i] * lumFilter[j];
1230         }
1231         for (j = 0; j < chrFilterSize; j++) {
1232             U += chrUSrc[j][i] * chrFilter[j];
1233             V += chrVSrc[j][i] * chrFilter[j];
1234         }
1235         Y >>= 10;
1236         U >>= 10;
1237         V >>= 10;
1238         if (hasAlpha) {
1239             A = 1 << 21;
1240             for (j = 0; j < lumFilterSize; j++) {
1241                 A += alpSrc[j][i] * lumFilter[j];
1242             }
1243             A >>= 19;
1244             if (A & 0x100)
1245                 A = av_clip_uint8(A);
1246         }
1247         Y -= c->yuv2rgb_y_offset;
1248         Y *= c->yuv2rgb_y_coeff;
1249         Y += 1 << 21;
1250         R = Y + V*c->yuv2rgb_v2r_coeff;
1251         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1252         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1253         if ((R | G | B) & 0xC0000000) {
1254             R = av_clip_uintp2(R, 30);
1255             G = av_clip_uintp2(G, 30);
1256             B = av_clip_uintp2(B, 30);
1257         }
1258
1259         switch(target) {
1260         case PIX_FMT_ARGB:
1261             dest[0] = hasAlpha ? A : 255;
1262             dest[1] = R >> 22;
1263             dest[2] = G >> 22;
1264             dest[3] = B >> 22;
1265             break;
1266         case PIX_FMT_RGB24:
1267             dest[0] = R >> 22;
1268             dest[1] = G >> 22;
1269             dest[2] = B >> 22;
1270             break;
1271         case PIX_FMT_RGBA:
1272             dest[0] = R >> 22;
1273             dest[1] = G >> 22;
1274             dest[2] = B >> 22;
1275             dest[3] = hasAlpha ? A : 255;
1276             break;
1277         case PIX_FMT_ABGR:
1278             dest[0] = hasAlpha ? A : 255;
1279             dest[1] = B >> 22;
1280             dest[2] = G >> 22;
1281             dest[3] = R >> 22;
1282             dest += 4;
1283             break;
1284         case PIX_FMT_BGR24:
1285             dest[0] = B >> 22;
1286             dest[1] = G >> 22;
1287             dest[2] = R >> 22;
1288             break;
1289         case PIX_FMT_BGRA:
1290             dest[0] = B >> 22;
1291             dest[1] = G >> 22;
1292             dest[2] = R >> 22;
1293             dest[3] = hasAlpha ? A : 255;
1294             break;
1295         }
1296         dest += step;
1297     }
1298 }
1299
/*
 * Full-chroma RGB output functions built on yuv2rgb_full_X_c_template; only
 * the _X_c entry point exists for these. With CONFIG_SMALL one function per
 * 32-bit layout decides at run time (via c->alpPixBuf) whether alpha is
 * written; otherwise alpha ("...a32_full" / "a...32_full") and no-alpha
 * ("...x32_full" / "x...32_full") variants are generated with a constant
 * hasAlpha, the alpha ones only when CONFIG_SWSCALE_ALPHA is enabled.
 */
#if CONFIG_SMALL
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1)
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1)
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1)
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1)
#endif
YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0)
YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0)
YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0)
YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0)
#endif
/* 24-bit formats have no alpha byte. */
YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0)
YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0)
1319
/*
 * Fill a rectangle of a byte plane with a constant value.
 *
 * Starting at row y, 'height' consecutive rows are processed; in each one the
 * first 'width' bytes are set to 'val'. Rows are 'stride' bytes apart.
 * Nothing is written when height is zero or negative.
 */
static av_always_inline void fillPlane(uint8_t* plane, int stride,
                                       int width, int height,
                                       int y, uint8_t val)
{
    uint8_t *row = plane + stride*y;
    int rowsLeft;

    for (rowsLeft = height; rowsLeft > 0; rowsLeft--) {
        memset(row, val, width);
        row += stride;
    }
}
1331
/* Helpers for the 48-bit RGB input templates below; they rely on a variable
 * named 'origin' (the source pixel format) being in scope. */

/* Read one 16-bit component at 'pos' in the byte order of 'origin'. */
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))

/* The templates load the first and third component of a pixel into locals
 * named r_b and b_r; 'r' and 'b' pick the right one depending on whether
 * 'origin' is a BGR48 or an RGB48 format. */
#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1336
1337 static av_always_inline void
1338 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1339                     enum PixelFormat origin)
1340 {
1341     int i;
1342     for (i = 0; i < width; i++) {
1343         unsigned int r_b = input_pixel(&src[i*3+0]);
1344         unsigned int   g = input_pixel(&src[i*3+1]);
1345         unsigned int b_r = input_pixel(&src[i*3+2]);
1346
1347         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1348     }
1349 }
1350
1351 static av_always_inline void
1352 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1353                     const uint16_t *src1, const uint16_t *src2,
1354                     int width, enum PixelFormat origin)
1355 {
1356     int i;
1357     assert(src1==src2);
1358     for (i = 0; i < width; i++) {
1359         int r_b = input_pixel(&src1[i*3+0]);
1360         int   g = input_pixel(&src1[i*3+1]);
1361         int b_r = input_pixel(&src1[i*3+2]);
1362
1363         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1364         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1365     }
1366 }
1367
1368 static av_always_inline void
1369 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1370                           const uint16_t *src1, const uint16_t *src2,
1371                           int width, enum PixelFormat origin)
1372 {
1373     int i;
1374     assert(src1==src2);
1375     for (i = 0; i < width; i++) {
1376         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1377         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1378         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1379
1380         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1381         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1382     }
1383 }
1384
1385 #undef r
1386 #undef b
1387 #undef input_pixel
1388
/*
 * rgb48funcs(pattern, BE_LE, origin)
 *
 * Defines three static input functions for one 48-bit format:
 *   <pattern>48<BE_LE>ToY_c, <pattern>48<BE_LE>ToUV_c and
 *   <pattern>48<BE_LE>ToUV_half_c.
 * Each one casts its byte pointers to uint16_t pointers and forwards to the
 * matching rgb48To*_c_template with the constant pixel format 'origin'.
 * The trailing uint32_t pointer argument is not used.
 */
#define rgb48funcs(pattern, BE_LE, origin) \
static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
                                    int width, uint32_t *unused) \
{ \
    const uint16_t *src = (const uint16_t *) _src; \
    uint16_t *dst = (uint16_t *) _dst; \
    rgb48ToY_c_template(dst, src, width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
                                    const uint8_t *_src1, const uint8_t *_src2, \
                                    int width, uint32_t *unused) \
{ \
    const uint16_t *src1 = (const uint16_t *) _src1, \
                   *src2 = (const uint16_t *) _src2; \
    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
                                    const uint8_t *_src1, const uint8_t *_src2, \
                                    int width, uint32_t *unused) \
{ \
    const uint16_t *src1 = (const uint16_t *) _src1, \
                   *src2 = (const uint16_t *) _src2; \
    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
}

/* One set of input functions per component order and byte order. */
rgb48funcs(rgb, LE, PIX_FMT_RGB48LE)
rgb48funcs(rgb, BE, PIX_FMT_RGB48BE)
rgb48funcs(bgr, LE, PIX_FMT_BGR48LE)
rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
1422
/* Read packed pixel number 'i' from 'src' for the 16/32-bit RGB input
 * templates below (both 'src' and 'origin' must be in scope): the four
 * 32-bit formats are read as one native 32-bit word (4 bytes per pixel),
 * everything else as a 16-bit value in the byte order of 'origin'
 * (2 bytes per pixel). */
#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1426
1427 static av_always_inline void
1428 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1429                        int width, enum PixelFormat origin,
1430                        int shr,   int shg,   int shb, int shp,
1431                        int maskr, int maskg, int maskb,
1432                        int rsh,   int gsh,   int bsh, int S)
1433 {
1434     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
1435     const unsigned rnd = 33u << (S - 1);
1436     int i;
1437
1438     for (i = 0; i < width; i++) {
1439         int px = input_pixel(i) >> shp;
1440         int b = (px & maskb) >> shb;
1441         int g = (px & maskg) >> shg;
1442         int r = (px & maskr) >> shr;
1443
1444         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1445     }
1446 }
1447
1448 static av_always_inline void
1449 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1450                         const uint8_t *src, int width,
1451                         enum PixelFormat origin,
1452                         int shr,   int shg,   int shb, int shp,
1453                         int maskr, int maskg, int maskb,
1454                         int rsh,   int gsh,   int bsh, int S)
1455 {
1456     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1457               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
1458     const unsigned rnd = 257u << (S - 1);
1459     int i;
1460
1461     for (i = 0; i < width; i++) {
1462         int px = input_pixel(i) >> shp;
1463         int b = (px & maskb) >> shb;
1464         int g = (px & maskg) >> shg;
1465         int r = (px & maskr) >> shr;
1466
1467         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1468         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1469     }
1470 }
1471
/**
 * Convert pairs of horizontally adjacent packed RGB pixels into one U and one
 * V sample per pair (the two pixels of a pair are summed, then the extra
 * factor of two is removed by the final shift of S + 1).
 *
 * @param dstU,dstV  output chroma lines, width samples each
 * @param src        packed input; 2 * width pixels are fetched through the
 *                   input_pixel() macro, which is defined outside this block
 * @param width      number of output samples
 * @param origin     source pixel format, used to pick the green handling
 * @param shr,shg,shb right shifts applied to the red/green/blue sums
 * @param shp        right shift applied to every loaded pixel
 * @param maskr,maskg,maskb bit masks selecting each component in a pixel
 * @param rsh,gsh,bsh left shifts applied to the conversion coefficients
 * @param S          fixed-point shift of the single-pixel formula
 */
static av_always_inline void
rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src, int width,
                             enum PixelFormat origin,
                             int shr,   int shg,   int shb, int shp,
                             int maskr, int maskg, int maskb,
                             int rsh,   int gsh,   int bsh, int S)
{
    /* maskgx selects every bit that belongs to neither red nor blue */
    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
              maskgx = ~(maskr | maskb);
    /* rounding/bias term, one bit larger than for a single pixel */
    const unsigned rnd = 257u << S;
    int i;

    /* a sum of two fields needs one more bit, so widen each mask by one */
    maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
    for (i = 0; i < width; i++) {
        int px0 = input_pixel(2 * i + 0) >> shp;
        int px1 = input_pixel(2 * i + 1) >> shp;
        /* green is summed separately so its carry cannot spill into the
         * neighbouring red/blue field */
        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
        /* what remains of the total is the sum of the red and blue fields */
        int rb = px0 + px1 - g;

        b = (rb & maskb) >> shb;
        /* NOTE(review): the unmasked branch presumably relies on the maskgx
         * bits holding nothing but green for these formats - confirm */
        if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
            origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
            g >>= shg;
        } else {
            g = (g  & maskg) >> shg;
        }
        r = (rb & maskr) >> shr;

        /* S + 1: the extra bit divides the two-pixel sum by two */
        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
    }
}
1506
1507 #undef input_pixel
1508
/*
 * Define the three input converters for one packed 16/32-bit RGB format:
 *   <name>ToY_c        - luma, forwards to rgb16_32ToY_c_template
 *   <name>ToUV_c       - chroma, forwards to rgb16_32ToUV_c_template
 *   <name>ToUV_half_c  - chroma from pixel pairs, forwards to
 *                        rgb16_32ToUV_half_c_template
 * All macro arguments after 'name' are passed through to the templates
 * unchanged; the 'dummy' and 'unused' function parameters are ignored.
 */
#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
                         maskg, maskb, rsh, gsh, bsh, S) \
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
                          int width, uint32_t *unused) \
{ \
    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                           const uint8_t *src, const uint8_t *dummy, \
                           int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
                                const uint8_t *src, const uint8_t *dummy, \
                                int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
}
1533
/*
 * Instantiate <name>ToY_c / <name>ToUV_c / <name>ToUV_half_c for each packed
 * RGB input format. Argument order:
 *   fmt, name, shr, shg, shb, shp, maskr, maskg, maskb, rsh, gsh, bsh, S
 */
rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
1546
/* Copy byte 0 of every 4-byte packed pixel into dst (width samples). */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[4 * x];
        x++;
    }
}
1554
/* Copy byte 3 of every 4-byte packed pixel into dst (width samples). */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[4 * x + 3];
        x++;
    }
}
1562
/* Look every source index up in pal and store the low byte of the entry. */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];
        dst[x] = pal[index] & 0xFF;
    }
}
1572
/*
 * Look every source index up in pal; bits 8..15 of the entry go to dstU and
 * bits 16..23 to dstV. Both source pointers must be identical.
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
        x++;
    }
}
1586
/**
 * Expand 1 bit-per-pixel input, MSB first, where a set bit means black:
 * each pixel becomes 0 (bit set) or 255 (bit clear).
 *
 * Exactly width pixels are written. The previous implementation only
 * processed width / 8 whole bytes and therefore left the last width % 8
 * pixels of the line unconverted.
 *
 * @param dst    output line, width bytes
 * @param src    packed input, (width + 7) / 8 bytes are read
 * @param width  number of pixels
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int full = width / 8; /* completely used source bytes */
    const int tail = width % 8; /* pixels in the trailing partial byte */
    int i, j;

    for (i = 0; i < full; i++) {
        int d = ~src[i];
        for (j = 0; j < 8; j++)
            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
    }
    if (tail > 0) {
        /* only the top 'tail' bits of the last byte carry pixels */
        int d = ~src[full];
        for (j = 0; j < tail; j++)
            dst[8 * full + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1597
/**
 * Expand 1 bit-per-pixel input, MSB first, where a set bit means white:
 * each pixel becomes 255 (bit set) or 0 (bit clear).
 *
 * Exactly width pixels are written. The previous implementation only
 * processed width / 8 whole bytes and therefore left the last width % 8
 * pixels of the line unconverted.
 *
 * @param dst    output line, width bytes
 * @param src    packed input, (width + 7) / 8 bytes are read
 * @param width  number of pixels
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int full = width / 8; /* completely used source bytes */
    const int tail = width % 8; /* pixels in the trailing partial byte */
    int i, j;

    for (i = 0; i < full; i++) {
        int d = src[i];
        for (j = 0; j < 8; j++)
            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
    }
    if (tail > 0) {
        /* only the top 'tail' bits of the last byte carry pixels */
        int d = src[full];
        for (j = 0; j < tail; j++)
            dst[8 * full + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1608
1609 //FIXME yuy2* can read up to 7 samples too much
1610
/* Gather the even-numbered bytes of src into dst (width samples). */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x];
        x++;
    }
}
1618
/*
 * From every 4-byte group of src1, byte 1 goes to dstU and byte 3 to dstV.
 * Both source pointers must be identical.
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int base = 4 * x;

        dstU[x] = src1[base + 1];
        dstV[x] = src1[base + 3];
        x++;
    }
    assert(src1 == src2);
}
1629
/* Byte-swap width 16-bit samples from _src into _dst. */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
{
    const uint16_t *in = (const uint16_t *) _src;
    uint16_t *out = (uint16_t *) _dst;
    int x = 0;

    while (x < width) {
        out[x] = av_bswap16(in[x]);
        x++;
    }
}
1639
/* Byte-swap width 16-bit samples of each of the two chroma inputs. */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *inU = (const uint16_t *) _src1;
    const uint16_t *inV = (const uint16_t *) _src2;
    uint16_t *outU = (uint16_t *) _dstU;
    uint16_t *outV = (uint16_t *) _dstV;
    int x = 0;

    while (x < width) {
        outU[x] = av_bswap16(inU[x]);
        outV[x] = av_bswap16(inV[x]);
        x++;
    }
}
1652
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
/* Gather the odd-numbered bytes of src into dst (width samples). */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x + 1];
        x++;
    }
}
1662
/*
 * From every 4-byte group of src1, byte 0 goes to dstU and byte 2 to dstV.
 * Both source pointers must be identical.
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int base = 4 * x;

        dstU[x] = src1[base + 0];
        dstV[x] = src1[base + 2];
        x++;
    }
    assert(src1 == src2);
}
1673
1674 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1675                                         const uint8_t *src, int width)
1676 {
1677     int i;
1678     for (i = 0; i < width; i++) {
1679         dst1[i] = src[2*i+0];
1680         dst2[i] = src[2*i+1];
1681     }
1682 }
1683
/*
 * De-interleave src1 with nvXXtoUV_c: the first byte of each pair goes to
 * dstU, the second to dstV. src2 and unused are ignored.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1690
/*
 * De-interleave src1 with nvXXtoUV_c, with the destinations swapped: the
 * first byte of each pair goes to dstV, the second to dstU. src2 and unused
 * are ignored.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1697
1698 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1699
1700 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1701                        int width, uint32_t *unused)
1702 {
1703     int i;
1704     for (i=0; i<width; i++) {
1705         int b= src[i*3+0];
1706         int g= src[i*3+1];
1707         int r= src[i*3+2];
1708
1709         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1710     }
1711 }
1712
1713 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1714                         const uint8_t *src2, int width, uint32_t *unused)
1715 {
1716     int i;
1717     for (i=0; i<width; i++) {
1718         int b= src1[3*i + 0];
1719         int g= src1[3*i + 1];
1720         int r= src1[3*i + 2];
1721
1722         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1723         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1724     }
1725     assert(src1 == src2);
1726 }
1727
1728 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1729                              const uint8_t *src2, int width, uint32_t *unused)
1730 {
1731     int i;
1732     for (i=0; i<width; i++) {
1733         int b= src1[6*i + 0] + src1[6*i + 3];
1734         int g= src1[6*i + 1] + src1[6*i + 4];
1735         int r= src1[6*i + 2] + src1[6*i + 5];
1736
1737         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1738         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1739     }
1740     assert(src1 == src2);
1741 }
1742
1743 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1744                        uint32_t *unused)
1745 {
1746     int i;
1747     for (i=0; i<width; i++) {
1748         int r= src[i*3+0];
1749         int g= src[i*3+1];
1750         int b= src[i*3+2];
1751
1752         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1753     }
1754 }
1755
1756 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1757                         const uint8_t *src2, int width, uint32_t *unused)
1758 {
1759     int i;
1760     assert(src1==src2);
1761     for (i=0; i<width; i++) {
1762         int r= src1[3*i + 0];
1763         int g= src1[3*i + 1];
1764         int b= src1[3*i + 2];
1765
1766         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1767         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1768     }
1769 }
1770
1771 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1772                              const uint8_t *src2, int width, uint32_t *unused)
1773 {
1774     int i;
1775     assert(src1==src2);
1776     for (i=0; i<width; i++) {
1777         int r= src1[6*i + 0] + src1[6*i + 3];
1778         int g= src1[6*i + 1] + src1[6*i + 4];
1779         int b= src1[6*i + 2] + src1[6*i + 5];
1780
1781         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1782         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1783     }
1784 }
1785
1786 static void planar_rgb_to_y(uint8_t *dst, const uint8_t *src[4], int width)
1787 {
1788     int i;
1789     for (i = 0; i < width; i++) {
1790         int g = src[0][i];
1791         int b = src[1][i];
1792         int r = src[2][i];
1793
1794         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1795     }
1796 }
1797
1798 static void planar_rgb16le_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1799 {
1800     int i;
1801     const uint16_t **src = (const uint16_t **) _src;
1802     uint16_t *dst = (uint16_t *) _dst;
1803     for (i = 0; i < width; i++) {
1804         int g = AV_RL16(src[0] + i);
1805         int b = AV_RL16(src[1] + i);
1806         int r = AV_RL16(src[2] + i);
1807
1808         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1809     }
1810 }
1811
1812 static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1813 {
1814     int i;
1815     const uint16_t **src = (const uint16_t **) _src;
1816     uint16_t *dst = (uint16_t *) _dst;
1817     for (i = 0; i < width; i++) {
1818         int g = AV_RB16(src[0] + i);
1819         int b = AV_RB16(src[1] + i);
1820         int r = AV_RB16(src[2] + i);
1821
1822         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1823     }
1824 }
1825
1826 static void planar_rgb_to_uv(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4], int width)
1827 {
1828     int i;
1829     for (i = 0; i < width; i++) {
1830         int g = src[0][i];
1831         int b = src[1][i];
1832         int r = src[2][i];
1833
1834         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1835         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1836     }
1837 }
1838
1839 static void planar_rgb16le_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1840 {
1841     int i;
1842     const uint16_t **src = (const uint16_t **) _src;
1843     uint16_t *dstU = (uint16_t *) _dstU;
1844     uint16_t *dstV = (uint16_t *) _dstV;
1845     for (i = 0; i < width; i++) {
1846         int g = AV_RL16(src[0] + i);
1847         int b = AV_RL16(src[1] + i);
1848         int r = AV_RL16(src[2] + i);
1849
1850         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1851         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1852     }
1853 }
1854
1855 static void planar_rgb16be_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1856 {
1857     int i;
1858     const uint16_t **src = (const uint16_t **) _src;
1859     uint16_t *dstU = (uint16_t *) _dstU;
1860     uint16_t *dstV = (uint16_t *) _dstV;
1861     for (i = 0; i < width; i++) {
1862         int g = AV_RB16(src[0] + i);
1863         int b = AV_RB16(src[1] + i);
1864         int r = AV_RB16(src[2] + i);
1865
1866         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1867         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1868     }
1869 }
1870
1871 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
1872                            const int16_t *filter,
1873                            const int16_t *filterPos, int filterSize)
1874 {
1875     int i;
1876     int32_t *dst = (int32_t *) _dst;
1877     const uint16_t *src = (const uint16_t *) _src;
1878     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1879     int sh = bits - 4;
1880
1881     for (i = 0; i < dstW; i++) {
1882         int j;
1883         int srcPos = filterPos[i];
1884         int val = 0;
1885
1886         for (j = 0; j < filterSize; j++) {
1887             val += src[srcPos + j] * filter[filterSize * i + j];
1888         }
1889         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1890         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1891     }
1892 }
1893
1894 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
1895                            const int16_t *filter,
1896                            const int16_t *filterPos, int filterSize)
1897 {
1898     int i;
1899     const uint16_t *src = (const uint16_t *) _src;
1900     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1901
1902     for (i = 0; i < dstW; i++) {
1903         int j;
1904         int srcPos = filterPos[i];
1905         int val = 0;
1906
1907         for (j = 0; j < filterSize; j++) {
1908             val += src[srcPos + j] * filter[filterSize * i + j];
1909         }
1910         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
1911         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
1912     }
1913 }
1914
1915 // bilinear / bicubic scaling
1916 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
1917                           const int16_t *filter, const int16_t *filterPos,
1918                           int filterSize)
1919 {
1920     int i;
1921     for (i=0; i<dstW; i++) {
1922         int j;
1923         int srcPos= filterPos[i];
1924         int val=0;
1925         for (j=0; j<filterSize; j++) {
1926             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1927         }
1928         //filter += hFilterSize;
1929         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
1930         //dst[i] = val>>7;
1931     }
1932 }
1933
1934 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
1935                           const int16_t *filter, const int16_t *filterPos,
1936                           int filterSize)
1937 {
1938     int i;
1939     int32_t *dst = (int32_t *) _dst;
1940     for (i=0; i<dstW; i++) {
1941         int j;
1942         int srcPos= filterPos[i];
1943         int val=0;
1944         for (j=0; j<filterSize; j++) {
1945             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1946         }
1947         //filter += hFilterSize;
1948         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
1949         //dst[i] = val>>7;
1950     }
1951 }
1952
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/*
 * In-place chroma range expansion on 16-bit intermediates: each sample is
 * clamped to 30775, multiplied by 4663/4096 and offset by -9289992/4096.
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        dstU[x] = (FFMIN(dstU[x], 30775) * 4663 - 9289992) >> 12; //-264
        dstV[x] = (FFMIN(dstV[x], 30775) * 4663 - 9289992) >> 12; //-264
        x++;
    }
}
/*
 * In-place chroma range compression on 16-bit intermediates: each sample is
 * multiplied by 1799/2048 and offset by 4081085/2048.
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        dstU[x] = (dstU[x] * 1799 + 4081085) >> 11; //1469
        dstV[x] = (dstV[x] * 1799 + 4081085) >> 11; //1469
        x++;
    }
}
/*
 * In-place luma range expansion on 16-bit intermediates: each sample is
 * clamped to 30189, multiplied by 19077/16384 and offset by -39057361/16384.
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        dst[x] = (FFMIN(dst[x], 30189) * 19077 - 39057361) >> 14;
        x++;
    }
}
/*
 * In-place luma range compression on 16-bit intermediates: each sample is
 * multiplied by 14071/16384 and offset by 33561947/16384.
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        dst[x] = (dst[x] * 14071 + 33561947) >> 14;
        x++;
    }
}
1983
/**
 * In-place chroma range expansion for the high-bit-depth path.
 *
 * The buffers are declared int16_t * but hold int32_t samples. Each sample
 * is clamped to 30775 << 4, multiplied by 4663/4096 and offset by
 * -(9289992 << 4)/4096.
 *
 * The product is formed in 64 bits: for samples above 460537 the former
 * 32-bit product (up to 492400 * 4663) exceeded INT_MAX, which is undefined
 * behaviour for signed int even though the final result fits.
 *
 * @param _dstU,_dstV chroma lines, width int32_t samples each
 * @param width       number of samples
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    const int32_t limit = 30775 << 4;
    const int64_t bias = 9289992 << 4;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    int i;

    for (i = 0; i < width; i++) {
        const int64_t u = dstU[i] < limit ? dstU[i] : limit;
        const int64_t v = dstV[i] < limit ? dstV[i] : limit;

        dstU[i] = (int32_t) ((u * 4663 - bias) >> 12); //-264
        dstV[i] = (int32_t) ((v * 4663 - bias) >> 12); //-264
    }
}
/*
 * In-place chroma range compression for the high-bit-depth path. The
 * buffers are declared int16_t * but hold int32_t samples; each sample is
 * multiplied by 1799/2048 and offset by (4081085 << 4)/2048.
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u = (int32_t *) _dstU;
    int32_t *v = (int32_t *) _dstV;
    int x = 0;

    while (x < width) {
        u[x] = (u[x] * 1799 + (4081085 << 4)) >> 11; //1469
        v[x] = (v[x] * 1799 + (4081085 << 4)) >> 11; //1469
        x++;
    }
}
/**
 * In-place luma range expansion for the high-bit-depth path.
 *
 * The buffer is declared int16_t * but holds int32_t samples. Each sample is
 * clamped to 30189 << 4, multiplied by 4769/4096 and offset by
 * -(39057361 << 2)/4096.
 *
 * The product is formed in 64 bits: the former 32-bit product (up to
 * 483024 * 4769) exceeded INT_MAX for samples near the clamp value, which is
 * undefined behaviour for signed int even though the final result fits.
 *
 * @param _dst  luma line, width int32_t samples
 * @param width number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    const int32_t limit = 30189 << 4;
    const int64_t bias = 39057361 << 2;
    int32_t *dst = (int32_t *) _dst;
    int i;

    for (i = 0; i < width; i++) {
        const int64_t y = dst[i] < limit ? dst[i] : limit;

        dst[i] = (int32_t) ((y * 4769 - bias) >> 12);
    }
}
/**
 * In-place luma range compression for the high-bit-depth path.
 *
 * The buffer is declared int16_t * but holds int32_t samples. Each sample is
 * multiplied by 14071/16384 and offset by (33561947 << 4)/16384.
 *
 * The arithmetic is done in 64 bits: in 32 bits, sample * 14071 plus the
 * offset exceeds INT_MAX for every sample of 114455 or more, so the brighter
 * part of the range overflowed and produced wrong values.
 *
 * @param _dst  luma line, width int32_t samples
 * @param width number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    const int64_t bias = (int64_t) 33561947 << 4;
    int32_t *dst = (int32_t *) _dst;
    int i;

    for (i = 0; i < width; i++)
        dst[i] = (int32_t) (((int64_t) dst[i] * 14071 + bias) >> 14);
}
2018
2019 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2020                            const uint8_t *src, int srcW, int xInc)
2021 {
2022     int i;
2023     unsigned int xpos=0;
2024     for (i=0;i<dstWidth;i++) {
2025         register unsigned int xx=xpos>>16;
2026         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2027         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2028         xpos+=xInc;
2029     }
2030 }
2031
2032 // *** horizontal scale Y line to temp buffer
2033 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2034                                      const uint8_t *src_in[4], int srcW, int xInc,
2035                                      const int16_t *hLumFilter,
2036                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2037                                      uint8_t *formatConvBuffer,
2038                                      uint32_t *pal, int isAlpha)
2039 {
2040     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2041     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2042     const uint8_t *src = src_in[isAlpha ? 3 : 0];
2043
2044     if (toYV12) {
2045         toYV12(formatConvBuffer, src, srcW, pal);
2046         src= formatConvBuffer;
2047     } else if (c->readLumPlanar && !isAlpha) {
2048         c->readLumPlanar(formatConvBuffer, src_in, srcW);
2049         src = formatConvBuffer;
2050     }
2051
2052     if (!c->hyscale_fast) {
2053         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2054     } else { // fast bilinear upscale / crap downscale
2055         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2056     }
2057
2058     if (convertRange)
2059         convertRange(dst, dstWidth);
2060 }
2061
2062 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2063                            int dstWidth, const uint8_t *src1,
2064                            const uint8_t *src2, int srcW, int xInc)
2065 {
2066     int i;
2067     unsigned int xpos=0;
2068     for (i=0;i<dstWidth;i++) {
2069         register unsigned int xx=xpos>>16;
2070         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2071         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2072         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2073         xpos+=xInc;
2074     }
2075 }
2076
2077 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2078                                      const uint8_t *src_in[4],
2079                                      int srcW, int xInc, const int16_t *hChrFilter,
2080                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2081                                      uint8_t *formatConvBuffer, uint32_t *pal)
2082 {
2083     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
2084     if (c->chrToYV12) {
2085         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
2086         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2087         src1= formatConvBuffer;
2088         src2= buf2;
2089     } else if (c->readChrPlanar) {
2090         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
2091         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
2092         src1= formatConvBuffer;
2093         src2= buf2;
2094     }
2095
2096     if (!c->hcscale_fast) {
2097         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2098         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2099     } else { // fast bilinear upscale / crap downscale
2100         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2101     }
2102
2103     if (c->chrConvertRange)
2104         c->chrConvertRange(dst1, dst2, dstWidth);
2105 }
2106
/**
 * Select the C output functions that match c->dstFormat and c->flags.
 *
 * Only the function pointers relevant to the destination format are
 * written; the others are left untouched.
 *
 * @param c           scaler context; dstFormat, flags and alpPixBuf are read
 * @param yuv2plane1  receives the single-line planar writer
 * @param yuv2planeX  receives the multi-tap planar writer
 * @param yuv2nv12cX  receives the interleaved chroma writer (NV12/NV21 only)
 * @param yuv2packed1 receives the packed writer taking one input line
 * @param yuv2packed2 receives the packed writer taking two input lines
 * @param yuv2packedX receives the packed writer taking a full vertical filter
 */
static av_always_inline void
find_c_packed_planar_out_funcs(SwsContext *c,
                               yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX,
                               yuv2interleavedX_fn *yuv2nv12cX,
                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
                               yuv2packedX_fn *yuv2packedX)
{
    enum PixelFormat dstFormat = c->dstFormat;

    /* planar writers, chosen by output bit depth and endianness */
    if (is16BPS(dstFormat)) {
        *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c  : yuv2planeX_16LE_c;
        *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c  : yuv2plane1_16LE_c;
    } else if (is9_OR_10BPS(dstFormat)) {
        /* depth_minus1 == 8 means 9 bits per sample, otherwise 10 */
        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c  : yuv2planeX_9LE_c;
            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c  : yuv2plane1_9LE_c;
        } else {
            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c  : yuv2planeX_10LE_c;
            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c  : yuv2plane1_10LE_c;
        }
    } else {
        *yuv2plane1 = yuv2plane1_8_c;
        *yuv2planeX = yuv2planeX_8_c;
        if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
            *yuv2nv12cX = yuv2nv12cX_c;
    }

    /* packed RGB writers; with SWS_FULL_CHR_H_INT only yuv2packedX is set */
    if(c->flags & SWS_FULL_CHR_H_INT) {
        switch (dstFormat) {
            case PIX_FMT_RGBA:
#if CONFIG_SMALL
                *yuv2packedX = yuv2rgba32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                /* the alpha-writing variant is used only with an alpha buffer */
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2rgba32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2rgbx32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_ARGB:
#if CONFIG_SMALL
                *yuv2packedX = yuv2argb32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2argb32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2xrgb32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_BGRA:
#if CONFIG_SMALL
                *yuv2packedX = yuv2bgra32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2bgra32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2bgrx32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_ABGR:
#if CONFIG_SMALL
                *yuv2packedX = yuv2abgr32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2abgr32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2xbgr32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_RGB24:
            *yuv2packedX = yuv2rgb24_full_X_c;
            break;
        case PIX_FMT_BGR24:
            *yuv2packedX = yuv2bgr24_full_X_c;
            break;
        }
    } else {
        /* without SWS_FULL_CHR_H_INT all three packed writers are set */
        switch (dstFormat) {
        case PIX_FMT_RGB48LE:
            *yuv2packed1 = yuv2rgb48le_1_c;
            *yuv2packed2 = yuv2rgb48le_2_c;
            *yuv2packedX = yuv2rgb48le_X_c;
            break;
        case PIX_FMT_RGB48BE:
            *yuv2packed1 = yuv2rgb48be_1_c;
            *yuv2packed2 = yuv2rgb48be_2_c;
            *yuv2packedX = yuv2rgb48be_X_c;
            break;
        case PIX_FMT_BGR48LE:
            *yuv2packed1 = yuv2bgr48le_1_c;
            *yuv2packed2 = yuv2bgr48le_2_c;
            *yuv2packedX = yuv2bgr48le_X_c;
            break;
        case PIX_FMT_BGR48BE:
            *yuv2packed1 = yuv2bgr48be_1_c;
            *yuv2packed2 = yuv2bgr48be_2_c;
            *yuv2packedX = yuv2bgr48be_X_c;
            break;
        case PIX_FMT_RGB32:
        case PIX_FMT_BGR32:
#if CONFIG_SMALL
            *yuv2packed1 = yuv2rgb32_1_c;
            *yuv2packed2 = yuv2rgb32_2_c;
            *yuv2packedX = yuv2rgb32_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_c;
                    *yuv2packed2 = yuv2rgba32_2_c;
                    *yuv2packedX = yuv2rgba32_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_c;
                    *yuv2packed2 = yuv2rgbx32_2_c;
                    *yuv2packedX = yuv2rgbx32_X_c;
                }
#endif /* !CONFIG_SMALL */
            break;
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32_1:
#if CONFIG_SMALL
                *yuv2packed1 = yuv2rgb32_1_1_c;
                *yuv2packed2 = yuv2rgb32_1_2_c;
                *yuv2packedX = yuv2rgb32_1_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_1_c;
                    *yuv2packed2 = yuv2rgba32_1_2_c;
                    *yuv2packedX = yuv2rgba32_1_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_1_c;
                    *yuv2packed2 = yuv2rgbx32_1_2_c;
                    *yuv2packedX = yuv2rgbx32_1_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
        case PIX_FMT_RGB24:
            *yuv2packed1 = yuv2rgb24_1_c;
            *yuv2packed2 = yuv2rgb24_2_c;
            *yuv2packedX = yuv2rgb24_X_c;
            break;
        case PIX_FMT_BGR24:
            *yuv2packed1 = yuv2bgr24_1_c;
            *yuv2packed2 = yuv2bgr24_2_c;
            *yuv2packedX = yuv2bgr24_X_c;
            break;
        case PIX_FMT_RGB565LE:
        case PIX_FMT_RGB565BE:
        case PIX_FMT_BGR565LE:
        case PIX_FMT_BGR565BE:
            *yuv2packed1 = yuv2rgb16_1_c;
            *yuv2packed2 = yuv2rgb16_2_c;
            *yuv2packedX = yuv2rgb16_X_c;
            break;
        case PIX_FMT_RGB555LE:
        case PIX_FMT_RGB555BE:
        case PIX_FMT_BGR555LE:
        case PIX_FMT_BGR555BE:
            *yuv2packed1 = yuv2rgb15_1_c;
            *yuv2packed2 = yuv2rgb15_2_c;
            *yuv2packedX = yuv2rgb15_X_c;
            break;
        case PIX_FMT_RGB444LE:
        case PIX_FMT_RGB444BE:
        case PIX_FMT_BGR444LE:
        case PIX_FMT_BGR444BE:
            *yuv2packed1 = yuv2rgb12_1_c;
            *yuv2packed2 = yuv2rgb12_2_c;
            *yuv2packedX = yuv2rgb12_X_c;
            break;
        case PIX_FMT_RGB8:
        case PIX_FMT_BGR8:
            *yuv2packed1 = yuv2rgb8_1_c;
            *yuv2packed2 = yuv2rgb8_2_c;
            *yuv2packedX = yuv2rgb8_X_c;
            break;
        case PIX_FMT_RGB4:
        case PIX_FMT_BGR4:
            *yuv2packed1 = yuv2rgb4_1_c;
            *yuv2packed2 = yuv2rgb4_2_c;
            *yuv2packedX = yuv2rgb4_X_c;
            break;
        case PIX_FMT_RGB4_BYTE:
        case PIX_FMT_BGR4_BYTE:
            *yuv2packed1 = yuv2rgb4b_1_c;
            *yuv2packed2 = yuv2rgb4b_2_c;
            *yuv2packedX = yuv2rgb4b_X_c;
            break;
        }
    }
    /* packed non-RGB formats, selected regardless of SWS_FULL_CHR_H_INT */
    switch (dstFormat) {
    case PIX_FMT_GRAY16BE:
        *yuv2packed1 = yuv2gray16BE_1_c;
        *yuv2packed2 = yuv2gray16BE_2_c;
        *yuv2packedX = yuv2gray16BE_X_c;
        break;
    case PIX_FMT_GRAY16LE:
        *yuv2packed1 = yuv2gray16LE_1_c;
        *yuv2packed2 = yuv2gray16LE_2_c;
        *yuv2packedX = yuv2gray16LE_X_c;
        break;
    case PIX_FMT_MONOWHITE:
        *yuv2packed1 = yuv2monowhite_1_c;
        *yuv2packed2 = yuv2monowhite_2_c;
        *yuv2packedX = yuv2monowhite_X_c;
        break;
    case PIX_FMT_MONOBLACK:
        *yuv2packed1 = yuv2monoblack_1_c;
        *yuv2packed2 = yuv2monoblack_2_c;
        *yuv2packedX = yuv2monoblack_X_c;
        break;
    case PIX_FMT_YUYV422:
        *yuv2packed1 = yuv2yuyv422_1_c;
        *yuv2packed2 = yuv2yuyv422_2_c;
        *yuv2packedX = yuv2yuyv422_X_c;
        break;
    case PIX_FMT_UYVY422:
        *yuv2packed1 = yuv2uyvy422_1_c;
        *yuv2packed2 = yuv2uyvy422_2_c;
        *yuv2packedX = yuv2uyvy422_X_c;
        break;
    }
}
2350
/* Set to 1 to have swScale() trace its slice and ring-buffer bookkeeping
 * through av_log(). */
#define DEBUG_SWSCALE_BUFFERS 0

/*
 * Debug trace helper. Expects a variable named `c` (passed to av_log() as
 * the logging context) to be in scope at the point of use.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement: with a bare `if`, a use such as
 *     if (x) DEBUG_BUFFERS(...); else foo();
 * would silently attach the `else` to the macro's internal `if`.
 */
#define DEBUG_BUFFERS(...)                          \
    do {                                            \
        if (DEBUG_SWSCALE_BUFFERS)                  \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__);   \
    } while (0)
2353
/**
 * Scale one horizontal slice of the source picture.
 *
 * Source lines of the slice are scaled horizontally (hyscale()/hcscale())
 * into the per-context line buffers lumPixBuf / chrUPixBuf / chrVPixBuf
 * (and alpPixBuf when present). As soon as every line needed by the
 * vertical filter of output line dstY is buffered, that line is written
 * through the yuv2plane1 / yuv2planeX / yuv2nv12cX / yuv2packed* function
 * pointers. When the slice runs out of lines, whatever is available is
 * buffered and the loop stops; the next call continues from there.
 *
 * Progress (dstY, the buffer indices and the last buffered source lines)
 * is read from the context on entry, reset when srcSliceY == 0, and
 * written back before returning.
 *
 * @param c          scaler context
 * @param src        source plane pointers; row 0 of each plane is source
 *                   line srcSliceY (chroma: srcSliceY >> c->chrSrcVSubSample).
 *                   For packed source formats entries 1..3 are overwritten
 *                   with src[0].
 * @param srcStride  source strides; modified in place (set to srcStride[0]
 *                   for packed formats, entries 1 and 2 shifted left by
 *                   c->vChrDrop)
 * @param srcSliceY  index of the first source line in this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers, addressed by absolute
 *                   output line (dst[0] + dstStride[0] * dstY)
 * @param dstStride  destination strides
 * @return number of output lines written by this call
 */
2354 static int swScale(SwsContext *c, const uint8_t* src[],
2355                    int srcStride[], int srcSliceY,
2356                    int srcSliceH, uint8_t* dst[], int dstStride[])
2357 {
2358     /* load a few things into local vars to make the code more readable? and faster */
2359     const int srcW= c->srcW;
2360     const int dstW= c->dstW;
2361     const int dstH= c->dstH;
2362     const int chrDstW= c->chrDstW;
2363     const int chrSrcW= c->chrSrcW;
2364     const int lumXInc= c->lumXInc;
2365     const int chrXInc= c->chrXInc;
2366     const enum PixelFormat dstFormat= c->dstFormat;
2367     const int flags= c->flags;
2368     int16_t *vLumFilterPos= c->vLumFilterPos;
2369     int16_t *vChrFilterPos= c->vChrFilterPos;
2370     int16_t *hLumFilterPos= c->hLumFilterPos;
2371     int16_t *hChrFilterPos= c->hChrFilterPos;
2372     int16_t *vLumFilter= c->vLumFilter;
2373     int16_t *vChrFilter= c->vChrFilter;
2374     int16_t *hLumFilter= c->hLumFilter;
2375     int16_t *hChrFilter= c->hChrFilter;
2376     int32_t *lumMmxFilter= c->lumMmxFilter;
2377     int32_t *chrMmxFilter= c->chrMmxFilter;
2378     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2379     const int vLumFilterSize= c->vLumFilterSize;
2380     const int vChrFilterSize= c->vChrFilterSize;
2381     const int hLumFilterSize= c->hLumFilterSize;
2382     const int hChrFilterSize= c->hChrFilterSize;
2383     int16_t **lumPixBuf= c->lumPixBuf;
2384     int16_t **chrUPixBuf= c->chrUPixBuf;
2385     int16_t **chrVPixBuf= c->chrVPixBuf;
2386     int16_t **alpPixBuf= c->alpPixBuf;
2387     const int vLumBufSize= c->vLumBufSize;
2388     const int vChrBufSize= c->vChrBufSize;
2389     uint8_t *formatConvBuffer= c->formatConvBuffer;
         /* Slice position/height in chroma lines. The -((-x) >> s) form rounds
          * the height up instead of down (this relies on >> of a negative int
          * being an arithmetic shift). */
2390     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2391     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2392     int lastDstY;
2393     uint32_t *pal=c->pal_yuv;
         /* Local copies of the output functions: they may be replaced by the
          * C versions for the last output lines without touching the context. */
2394     yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
2395     yuv2planarX_fn yuv2planeX = c->yuv2planeX;
2396     yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
2397     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2398     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2399     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
         /* Per-line dither tables are only used for sources with more than
          * 8 bits per sample; otherwise a constant table is installed below. */
2400     int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);
2401
2402     /* vars which will change and which we need to store back in the context */
2403     int dstY= c->dstY;
2404     int lumBufIndex= c->lumBufIndex;
2405     int chrBufIndex= c->chrBufIndex;
2406     int lastInLumBuf= c->lastInLumBuf;
2407     int lastInChrBuf= c->lastInChrBuf;
2408
         /* Packed input: every "plane" is read from plane 0.
          * Note that this writes into the caller's src/srcStride arrays. */
2409     if (isPacked(c->srcFormat)) {
2410         src[0]=
2411         src[1]=
2412         src[2]=
2413         src[3]= src[0];
2414         srcStride[0]=
2415         srcStride[1]=
2416         srcStride[2]=
2417         srcStride[3]= srcStride[0];
2418     }
         /* Scale the chroma strides by 2^vChrDrop (also modifies the caller's
          * array). */
2419     srcStride[1]<<= c->vChrDrop;
2420     srcStride[2]<<= c->vChrDrop;
2421
2422     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2423                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2424                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2425     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2426                    srcSliceY,    srcSliceH,    dstY,    dstH);
2427     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2428                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2429
         /* Warn (once per process, and only with SWS_PRINT_INFO) about
          * destination strides that are not multiples of 8. The static flag
          * is shared by all contexts and is not synchronized. */
2430     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2431         static int warnedAlready=0; //FIXME move this into the context perhaps
2432         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2433             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2434                    "         ->cannot do aligned memory accesses anymore\n");
2435             warnedAlready=1;
2436         }
2437     }
2438
2439     /* Note the user might start scaling the picture in the middle so this
2440        will not get executed. This is not really intended but works
2441        currently, so people might do it. */
2442     if (srcSliceY ==0) {
2443         lumBufIndex=-1;
2444         chrBufIndex=-1;
2445         dstY=0;
2446         lastInLumBuf= -1;
2447         lastInChrBuf= -1;
2448     }
2449
2450     if (!should_dither) {
2451         c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
2452     }
         /* Remember where this call started; used for the return value and for
          * the alpha fill after the loop. */
2453     lastDstY= dstY;
2454
         /* One iteration per output line; left early (break) when the slice
          * does not contain all source lines the line needs. */
2455     for (;dstY < dstH; dstY++) {
2456         const int chrDstY= dstY>>c->chrDstVSubSample;
             /* Output row pointers; the alpha row is NULL when no alpha
              * buffer is in use. */
2457         uint8_t *dest[4] = {
2458             dst[0] + dstStride[0] * dstY,
2459             dst[1] + dstStride[1] * chrDstY,
2460             dst[2] + dstStride[2] * chrDstY,
2461             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2462         };
2463
2464         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
             /* Same, but for the last luma output line sharing this chroma
              * output line (clamped to the last output line). */
2465         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2466         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2467
2468         // Last line needed as input
2469         int lastLumSrcY  = FFMIN(c->srcH,    firstLumSrcY  + vLumFilterSize) - 1;
2470         int lastLumSrcY2 = FFMIN(c->srcH,    firstLumSrcY2 + vLumFilterSize) - 1;
2471         int lastChrSrcY  = FFMIN(c->chrSrcH, firstChrSrcY  + vChrFilterSize) - 1;
2472         int enough_lines;
2473
2474         //handle holes (FAST_BILINEAR & weird filters)
             /* Source lines before the first needed one are skipped by
              * advancing the "last buffered" counters past them. */
2475         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2476         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2477         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2478         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2479
2480         DEBUG_BUFFERS("dstY: %d\n", dstY);
2481         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2482                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2483         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2484                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2485
2486         // Do we have enough lines in this slice to output the dstY line
2487         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2488
             /* Not enough: buffer everything up to the end of the slice and
              * leave the loop after the horizontal pass below. */
2489         if (!enough_lines) {
2490             lastLumSrcY = srcSliceY + srcSliceH - 1;
2491             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2492             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2493                                             lastLumSrcY, lastChrSrcY);
2494         }
2495
2496         //Do horizontal scaling
2497         while(lastInLumBuf < lastLumSrcY) {
                 /* Row pointers for source line lastInLumBuf + 1, relative to
                  * the start of the slice. */
2498             const uint8_t *src1[4] = {
2499                 src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
2500                 src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
2501                 src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
2502                 src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
2503             };
2504             lumBufIndex++;
2505             assert(lumBufIndex < 2*vLumBufSize);
2506             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2507             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2508             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2509                     hLumFilter, hLumFilterPos, hLumFilterSize,
2510                     formatConvBuffer,
2511                     pal, 0);
                 /* Alpha uses the luma scaler; the last argument selects alpha. */
2512             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2513                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src1, srcW,
2514                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2515                         formatConvBuffer,
2516                         pal, 1);
2517             lastInLumBuf++;
2518             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2519                                lumBufIndex,    lastInLumBuf);
2520         }
2521         while(lastInChrBuf < lastChrSrcY) {
2522             const uint8_t *src1[4] = {
2523                 src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
2524                 src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
2525                 src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
2526                 src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
2527             };
2528             chrBufIndex++;
2529             assert(chrBufIndex < 2*vChrBufSize);
2530             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2531             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2532             //FIXME replace parameters through context struct (some at least)
2533
                 /* The buffer index advances even when chroma scaling is
                  * skipped, so the bookkeeping stays in step. */
2534             if (c->needs_hcscale)
2535                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2536                           chrDstW, src1, chrSrcW, chrXInc,
2537                           hChrFilter, hChrFilterPos, hChrFilterSize,
2538                           formatConvBuffer, pal);
2539             lastInChrBuf++;
2540             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2541                                chrBufIndex,    lastInChrBuf);
2542         }
2543         //wrap buf index around to stay inside the ring buffer
2544         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2545         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2546         if (!enough_lines)
2547             break; //we can't output a dstY line so let's try with the next slice
2548
2549 #if HAVE_MMX
2550         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2551 #endif
2552         if (should_dither) {
2553             c->chrDither8 = dither_8x8_128[chrDstY & 7];
2554             c->lumDither8 = dither_8x8_128[dstY & 7];
2555         }
             /* For the last two output lines, switch the local function
              * pointers to the C implementations. */
2556         if (dstY >= dstH-2) {
2557             // hmm looks like we can't use MMX here without overwriting this array's tail
2558             find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
2559                                            &yuv2packed1, &yuv2packed2, &yuv2packedX);
2560         }
2561
2562         {
                 /* Pointers to the first buffered line needed by the vertical
                  * filter. lumBufIndex/chrBufIndex refer to lastInLumBuf/
                  * lastInChrBuf, so the first needed line lies
                  * (lastIn* - first*SrcY) entries earlier; adding the buffer
                  * size keeps the result non-negative.
                  * NOTE(review): this presumably relies on the pointer arrays
                  * holding each line pointer twice (at i and i + bufSize) --
                  * confirm against the context allocation code. */
2563             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2564             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2565             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2566             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2567
                 /* The luma filter window extends above line 0 or below the
                  * last source line: build a temporary pointer list, stored
                  * past the first 2 * vLumBufSize entries of the pointer array,
                  * in which the out-of-picture taps repeat the nearest valid
                  * line.
                  * NOTE(review): assumes the array was allocated with room for
                  * vLumFilterSize extra pointers there -- confirm. */
2568             if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
2569                 const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
2570                 int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
                     /* taps above the picture: repeat source line 0 */
2571                 for (i = 0; i < neg;            i++)
2572                     tmpY[i] = lumSrcPtr[neg];
                     /* taps inside the picture */
2573                 for (     ; i < end;            i++)
2574                     tmpY[i] = lumSrcPtr[i];
                     /* taps below the picture: repeat the last valid entry */
2575                 for (     ; i < vLumFilterSize; i++)
2576                     tmpY[i] = tmpY[i-1];
2577                 lumSrcPtr = tmpY;
2578
2579                 if (alpSrcPtr) {
2580                     const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
2581                     for (i = 0; i < neg;            i++)
2582                         tmpA[i] = alpSrcPtr[neg];
2583                     for (     ; i < end;            i++)
2584                         tmpA[i] = alpSrcPtr[i];
2585                     for (     ; i < vLumFilterSize; i++)
2586                         tmpA[i] = tmpA[i - 1];
2587                     alpSrcPtr = tmpA;
2588                 }
2589             }
                 /* Same edge replication for the two chroma planes. */
2590             if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
2591                 const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize,
2592                               **tmpV = (const int16_t **) chrVPixBuf + 2 * vChrBufSize;
2593                 int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
2594                 for (i = 0; i < neg;            i++) {
2595                     tmpU[i] = chrUSrcPtr[neg];
2596                     tmpV[i] = chrVSrcPtr[neg];
2597                 }
2598                 for (     ; i < end;            i++) {
2599                     tmpU[i] = chrUSrcPtr[i];
2600                     tmpV[i] = chrVSrcPtr[i];
2601                 }
2602                 for (     ; i < vChrFilterSize; i++) {
2603                     tmpU[i] = tmpU[i - 1];
2604                     tmpV[i] = tmpV[i - 1];
2605                 }
2606                 chrUSrcPtr = tmpU;
2607                 chrVSrcPtr = tmpV;
2608             }
2609
2610             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                     /* Non-zero low bits of dstY mean this luma line does not
                      * start a new chroma output line. */
2611                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2612
                     /* Luma plane: single-tap shortcut or full vertical filter. */
2613                 if (vLumFilterSize == 1) {
2614                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
2615                 } else {
2616                     yuv2planeX(vLumFilter + dstY * vLumFilterSize, vLumFilterSize,
2617                                lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
2618                 }
2619
                     /* Chroma planes: written once per chroma output line and
                      * not at all for gray output. */
2620                 if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
2621                     if (yuv2nv12cX) {
2622                         yuv2nv12cX(c, vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
2623                     } else if (vChrFilterSize == 1) {
2624                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
2625                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
2626                     } else {
2627                         yuv2planeX(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize,
2628                                    chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
2629                         yuv2planeX(vChrFilter + chrDstY * vChrFilterSize, vChrFilterSize,
2630                                    chrVSrcPtr, dest[2], chrDstW, c->chrDither8, 3);
2631                     }
2632                 }
2633
                     /* Alpha plane, filtered with the luma coefficients. */
2634                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2635                     if (vLumFilterSize == 1) {
2636                         yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
2637                     } else {
2638                         yuv2planeX(vLumFilter + dstY * vLumFilterSize, vLumFilterSize,
2639                                    alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
2640                     }
2641                 }
2642             } else {
                     /* Packed output: everything is written to dest[0]. */
2643                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2644                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                     /* Note: availability is tested on the context's pointers,
                      * the call goes through the (possibly replaced) locals. */
2645                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2646                     int chrAlpha = vChrFilter[2 * dstY + 1];
2647                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2648                                 alpPixBuf ? *alpSrcPtr : NULL,
2649                                 dest[0], dstW, chrAlpha, dstY);
2650                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2651                     int lumAlpha = vLumFilter[2 * dstY + 1];
2652                     int chrAlpha = vChrFilter[2 * dstY + 1];
                         /* First filter coefficient duplicated into both
                          * 16-bit halves of the 32-bit table entries. */
2653                     lumMmxFilter[2] =
2654                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2655                     chrMmxFilter[2] =
2656                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2657                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2658                                 alpPixBuf ? alpSrcPtr : NULL,
2659                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2660                 } else { //general RGB
2661                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2662                                 lumSrcPtr, vLumFilterSize,
2663                                 vChrFilter + dstY * vChrFilterSize,
2664                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2665                                 alpSrcPtr, dest[0], dstW, dstY);
2666                 }
2667             }
2668         }
2669     }
2670
         /* YUVA420P output without an alpha buffer: fill the alpha rows
          * produced by this call with 255. */
2671     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2672         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2673
2674 #if HAVE_MMX2
2675     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2676         __asm__ volatile("sfence":::"memory");
2677 #endif
2678     emms_c();
2679
2680     /* store changed local vars back in the context */
2681     c->dstY= dstY;
2682     c->lumBufIndex= lumBufIndex;
2683     c->chrBufIndex= chrBufIndex;
2684     c->lastInLumBuf= lastInLumBuf;
2685     c->lastInChrBuf= lastInChrBuf;
2686
2687     return dstY - lastDstY;
2688 }
2689
2690 static av_cold void sws_init_swScale_c(SwsContext *c)
2691 {
2692     enum PixelFormat srcFormat = c->srcFormat;
2693
2694     find_c_packed_planar_out_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
2695                                    &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,
2696                                    &c->yuv2packedX);
2697
2698     c->chrToYV12 = NULL;
2699     switch(srcFormat) {
2700         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2701         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2702         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2703         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2704         case PIX_FMT_RGB8     :
2705         case PIX_FMT_BGR8     :
2706         case PIX_FMT_PAL8     :
2707         case PIX_FMT_BGR4_BYTE:
2708         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2709         case PIX_FMT_GBRP9LE:
2710         case PIX_FMT_GBRP10LE:
2711         case PIX_FMT_GBRP16LE:  c->readChrPlanar = planar_rgb16le_to_uv; break;
2712         case PIX_FMT_GBRP9BE:
2713         case PIX_FMT_GBRP10BE:
2714         case PIX_FMT_GBRP16BE:  c->readChrPlanar = planar_rgb16be_to_uv; break;
2715         case PIX_FMT_GBRP:      c->readChrPlanar = planar_rgb_to_uv; break;
2716 #if HAVE_BIGENDIAN
2717         case PIX_FMT_YUV444P9LE:
2718         case PIX_FMT_YUV422P9LE:
2719         case PIX_FMT_YUV420P9LE:
2720         case PIX_FMT_YUV422P10LE:
2721         case PIX_FMT_YUV444P10LE:
2722         case PIX_FMT_YUV420P10LE:
2723         case PIX_FMT_YUV420P16LE:
2724         case PIX_FMT_YUV422P16LE:
2725         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2726 #else
2727         case PIX_FMT_YUV444P9BE:
2728         case PIX_FMT_YUV422P9BE:
2729         case PIX_FMT_YUV420P9BE:
2730         case PIX_FMT_YUV444P10BE:
2731         case PIX_FMT_YUV422P10BE:
2732         case PIX_FMT_YUV420P10BE:
2733         case PIX_FMT_YUV420P16BE:
2734         case PIX_FMT_YUV422P16BE:
2735         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2736 #endif
2737     }
2738     if (c->chrSrcHSubSample) {
2739         switch(srcFormat) {
2740         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2741         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2742         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2743         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2744         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2745         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2746         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2747         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2748         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2749         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2750         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2751         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2752         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2753         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2754         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2755         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2756         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2757         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2758         }
2759     } else {
2760         switch(srcFormat) {
2761         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2762         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2763         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2764         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2765         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2766         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2767         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2768         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2769         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2770         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2771         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2772         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2773         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2774         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2775         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2776         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2777         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2778         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2779         }
2780     }
2781
2782     c->lumToYV12 = NULL;
2783     c->alpToYV12 = NULL;
2784     switch (srcFormat) {
2785     case PIX_FMT_GBRP9LE:
2786     case PIX_FMT_GBRP10LE:
2787     case PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break;
2788     case PIX_FMT_GBRP9BE:
2789     case PIX_FMT_GBRP10BE:
2790     case PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break;
2791     case PIX_FMT_GBRP:     c->readLumPlanar = planar_rgb_to_y; break;
2792 #if HAVE_BIGENDIAN
2793     case PIX_FMT_YUV444P9LE:
2794     case PIX_FMT_YUV422P9LE:
2795     case PIX_FMT_YUV420P9LE:
2796     case PIX_FMT_YUV444P10LE:
2797     case PIX_FMT_YUV422P10LE:
2798     case PIX_FMT_YUV420P10LE:
2799     case PIX_FMT_YUV420P16LE:
2800     case PIX_FMT_YUV422P16LE:
2801     case PIX_FMT_YUV444P16LE:
2802     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2803 #else
2804     case PIX_FMT_YUV444P9BE:
2805     case PIX_FMT_YUV422P9BE:
2806     case PIX_FMT_YUV420P9BE:
2807     case PIX_FMT_YUV444P10BE:
2808     case PIX_FMT_YUV422P10BE:
2809     case PIX_FMT_YUV420P10BE:
2810     case PIX_FMT_YUV420P16BE:
2811     case PIX_FMT_YUV422P16BE:
2812     case PIX_FMT_YUV444P16BE:
2813     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2814 #endif
2815     case PIX_FMT_YUYV422  :
2816     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2817     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2818     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2819     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2820     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2821     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2822     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2823     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2824     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2825     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2826     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2827     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2828     case PIX_FMT_RGB8     :
2829     case PIX_FMT_BGR8     :
2830     case PIX_FMT_PAL8     :
2831     case PIX_FMT_BGR4_BYTE:
2832     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2833     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2834     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2835     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2836     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2837     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2838     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2839     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2840     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2841     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2842     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2843     }
2844     if (c->alpPixBuf) {
2845         switch (srcFormat) {
2846         case PIX_FMT_BGRA:
2847         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2848         case PIX_FMT_ABGR:
2849         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2850         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2851         }
2852     }
2853
2854     if (c->srcBpc == 8) {
2855         if (c->dstBpc <= 10) {
2856             c->hyScale = c->hcScale = hScale8To15_c;
2857             if (c->flags & SWS_FAST_BILINEAR) {
2858                 c->hyscale_fast = hyscale_fast_c;
2859                 c->hcscale_fast = hcscale_fast_c;
2860             }
2861         } else {
2862             c->hyScale = c->hcScale = hScale8To19_c;
2863         }
2864     } else {
2865         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
2866     }
2867
2868     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2869         if (c->dstBpc <= 10) {
2870             if (c->srcRange) {
2871                 c->lumConvertRange = lumRangeFromJpeg_c;
2872                 c->chrConvertRange = chrRangeFromJpeg_c;
2873             } else {
2874                 c->lumConvertRange = lumRangeToJpeg_c;
2875                 c->chrConvertRange = chrRangeToJpeg_c;
2876             }
2877         } else {
2878             if (c->srcRange) {
2879                 c->lumConvertRange = lumRangeFromJpeg16_c;
2880                 c->chrConvertRange = chrRangeFromJpeg16_c;
2881             } else {
2882                 c->lumConvertRange = lumRangeToJpeg16_c;
2883                 c->chrConvertRange = chrRangeToJpeg16_c;
2884             }
2885         }
2886     }
2887
2888     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2889           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2890         c->needs_hcscale = 1;
2891 }
2892
2893 SwsFunc ff_getSwsFunc(SwsContext *c)
2894 {
2895     sws_init_swScale_c(c);
2896
2897     if (HAVE_MMX)
2898         ff_sws_init_swScale_mmx(c);
2899     if (HAVE_ALTIVEC)
2900         ff_sws_init_swScale_altivec(c);
2901
2902     return swScale;
2903 }