git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include <inttypes.h>
  22 #include <string.h>
  23 #include <math.h>
  24 #include <stdio.h>
  25 #include "config.h"
  26 #include <assert.h>
  27 #include "swscale.h"
  28 #include "swscale_internal.h"
  29 #include "rgb2rgb.h"
  30 #include "libavutil/avassert.h"
  31 #include "libavutil/intreadwrite.h"
  32 #include "libavutil/cpu.h"
  33 #include "libavutil/avutil.h"
  34 #include "libavutil/mathematics.h"
  35 #include "libavutil/bswap.h"
  36 #include "libavutil/pixdesc.h"
  37
  38
/*
 * Fixed-point RGB -> YUV conversion coefficients.
 *
 * Each constant is a real-valued weight scaled by 219/255 (Y outputs) or
 * 224/255 (U/V outputs), multiplied by 2^RGB2YUV_SHIFT; its magnitude is
 * rounded to the nearest integer (the +0.5 before the int cast) and the
 * sign is applied afterwards. Names read <input channel><output channel>,
 * e.g. BY is the weight of blue in Y.
 * NOTE(review): 0.299/0.587/0.114 look like the ITU-R BT.601 luma weights
 * with limited-range scaling -- confirm.
 */
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  49
/*
 * NOTES
 * Special versions: fast Y 1:1 scaling (no interpolation in the y direction).
 *
 * TODO
 * - More intelligent misalignment avoidance for the horizontal scaler.
 * - Write a special vertical cubic upscale version.
 * - Optimize the C code (YV12 / minmax).
 * - Add support for packed-pixel YUV input and output.
 * - Add support for Y8 output.
 * - Optimize BGR24 and BGR32.
 * - Add BGR4 output support.
 * - Write a special BGR->BGR scaler.
 */
  64
/* 2-row dither table holding the values 0..3; each row repeats a two-entry
 * pattern four times to fill 8 columns. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
{  1,   3,   1,   3,   1,   3,   1,   3, },
{  2,   0,   2,   0,   2,   0,   2,   0, },
};
  69
/* 2-row dither table holding the values 0, 2, 4 and 6; each row repeats a
 * two-entry pattern four times to fill 8 columns. */
DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
{  6,   2,   6,   2,   6,   2,   6,   2, },
{  0,   4,   0,   4,   0,   4,   0,   4, },
};
  74
/* 4-row dither table with values 0..15; each row is a four-entry pattern
 * stored twice so that a row is 8 bytes wide. Not static: visible to other
 * translation units. */
DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
{  8,   4,  11,   7,   8,   4,  11,   7, },
{  2,  14,   1,  13,   2,  14,   1,  13, },
{ 10,   6,   9,   5,  10,   6,   9,   5, },
{  0,  12,   3,  15,   0,  12,   3,  15, },
};
  81
/* 8x8 dither table with values in the range 0..31. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
{ 17,   9,  23,  15,  16,   8,  22,  14, },
{  5,  29,   3,  27,   4,  28,   2,  26, },
{ 21,  13,  19,  11,  20,  12,  18,  10, },
{  0,  24,   6,  30,   1,  25,   7,  31, },
{ 16,   8,  22,  14,  17,   9,  23,  15, },
{  4,  28,   2,  26,   5,  29,   3,  27, },
{ 20,  12,  18,  10,  21,  13,  19,  11, },
{  1,  25,   7,  31,   0,  24,   6,  30, },
};
  92
/* 8x8 dither table with values in the range 0..72. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
{  0,  55,  14,  68,   3,  58,  17,  72, },
{ 37,  18,  50,  32,  40,  22,  54,  35, },
{  9,  64,   5,  59,  13,  67,   8,  63, },
{ 46,  27,  41,  23,  49,  31,  44,  26, },
{  2,  57,  16,  71,   1,  56,  15,  70, },
{ 39,  21,  52,  34,  38,  19,  51,  33, },
{ 11,  66,   7,  62,  10,  65,   6,  60, },
{ 48,  30,  43,  25,  47,  29,  42,  24, },
};
 103
/*
 * 8x8 dither table used by the monochrome output functions in this file
 * (row selected with y & 7).
 *
 * Four alternative tables are kept in the source, but because the first
 * condition is "#if 1" only the first one is ever compiled; the gamma
 * variants below are inactive and are selected by editing the conditions.
 */
#if 1
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{117,  62, 158, 103, 113,  58, 155, 100, },
{ 34, 199,  21, 186,  31, 196,  17, 182, },
{144,  89, 131,  76, 141,  86, 127,  72, },
{  0, 165,  41, 206,  10, 175,  52, 217, },
{110,  55, 151,  96, 120,  65, 162, 107, },
{ 28, 193,  14, 179,  38, 203,  24, 189, },
{138,  83, 124,  69, 148,  93, 134,  79, },
{  7, 172,  48, 213,   3, 168,  45, 210, },
};
#elif 1
// tries to correct a gamma of 1.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 143,  18, 200,   2, 156,  25, 215, },
{ 78,  28, 125,  64,  89,  36, 138,  74, },
{ 10, 180,   3, 161,  16, 195,   8, 175, },
{109,  51,  93,  38, 121,  60, 105,  47, },
{  1, 152,  23, 210,   0, 147,  20, 205, },
{ 85,  33, 134,  71,  81,  30, 130,  67, },
{ 14, 190,   6, 171,  12, 185,   5, 166, },
{117,  57, 101,  44, 113,  54,  97,  41, },
};
#elif 1
// tries to correct a gamma of 2.0
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 124,   8, 193,   0, 140,  12, 213, },
{ 55,  14, 104,  42,  66,  19, 119,  52, },
{  3, 168,   1, 145,   6, 187,   3, 162, },
{ 86,  31,  70,  21,  99,  39,  82,  28, },
{  0, 134,  11, 206,   0, 129,   9, 200, },
{ 62,  17, 114,  48,  58,  16, 109,  45, },
{  5, 181,   2, 157,   4, 175,   1, 151, },
{ 95,  36,  78,  26,  90,  34,  74,  24, },
};
#else
// tries to correct a gamma of 2.5
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 107,   3, 187,   0, 125,   6, 212, },
{ 39,   7,  86,  28,  49,  11, 102,  36, },
{  1, 158,   0, 131,   3, 180,   1, 151, },
{ 68,  19,  52,  12,  81,  25,  64,  17, },
{  0, 119,   5, 203,   0, 113,   4, 195, },
{ 45,   9,  96,  33,  42,   8,  91,  30, },
{  2, 172,   1, 144,   2, 165,   0, 137, },
{ 77,  23,  60,  15,  72,  21,  56,  14, },
};
#endif
/* 8x8 dither table containing only even values in the range 0..126.
 * The same 64 values also form the last matrix of dithers[] in this file. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
{  36, 68, 60, 92, 34, 66, 58, 90,},
{ 100,  4,124, 28, 98,  2,122, 26,},
{  52, 84, 44, 76, 50, 82, 42, 74,},
{ 116, 20,108, 12,114, 18,106, 10,},
{  32, 64, 56, 88, 38, 70, 62, 94,},
{  96,  0,120, 24,102,  6,126, 30,},
{  48, 80, 40, 72, 54, 86, 46, 78,},
{ 112, 16,104,  8,118, 22,110, 14,},
};
/* Eight bytes all equal to 64 (a flat, position-independent dither row);
 * externally visible. */
DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
{  64, 64, 64, 64, 64, 64, 64, 64 };
 164
/*
 * Eight 8x8 dither matrices of increasing amplitude, indexed as
 * dithers[matrix][row][column]. Value ranges per matrix:
 *   [0] 0..1    [1] 0..3    [2] 0..7    [3] 0..15
 *   [4] 0..31   [5] 0..63   [6] 0..63   [7] even values 0..126
 * Matrices [5] and [6] hold identical data, and [7] holds the same data
 * as dither_8x8_128.
 * NOTE(review): how the first index is chosen is not visible in this part
 * of the file -- presumably by the number of bits being dropped; confirm.
 */
DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
{
  {   0,  1,  0,  1,  0,  1,  0,  1,},
  {   1,  0,  1,  0,  1,  0,  1,  0,},
  {   0,  1,  0,  1,  0,  1,  0,  1,},
  {   1,  0,  1,  0,  1,  0,  1,  0,},
  {   0,  1,  0,  1,  0,  1,  0,  1,},
  {   1,  0,  1,  0,  1,  0,  1,  0,},
  {   0,  1,  0,  1,  0,  1,  0,  1,},
  {   1,  0,  1,  0,  1,  0,  1,  0,},
},{
  {   1,  2,  1,  2,  1,  2,  1,  2,},
  {   3,  0,  3,  0,  3,  0,  3,  0,},
  {   1,  2,  1,  2,  1,  2,  1,  2,},
  {   3,  0,  3,  0,  3,  0,  3,  0,},
  {   1,  2,  1,  2,  1,  2,  1,  2,},
  {   3,  0,  3,  0,  3,  0,  3,  0,},
  {   1,  2,  1,  2,  1,  2,  1,  2,},
  {   3,  0,  3,  0,  3,  0,  3,  0,},
},{
  {   2,  4,  3,  5,  2,  4,  3,  5,},
  {   6,  0,  7,  1,  6,  0,  7,  1,},
  {   3,  5,  2,  4,  3,  5,  2,  4,},
  {   7,  1,  6,  0,  7,  1,  6,  0,},
  {   2,  4,  3,  5,  2,  4,  3,  5,},
  {   6,  0,  7,  1,  6,  0,  7,  1,},
  {   3,  5,  2,  4,  3,  5,  2,  4,},
  {   7,  1,  6,  0,  7,  1,  6,  0,},
},{
  {   4,  8,  7, 11,  4,  8,  7, 11,},
  {  12,  0, 15,  3, 12,  0, 15,  3,},
  {   6, 10,  5,  9,  6, 10,  5,  9,},
  {  14,  2, 13,  1, 14,  2, 13,  1,},
  {   4,  8,  7, 11,  4,  8,  7, 11,},
  {  12,  0, 15,  3, 12,  0, 15,  3,},
  {   6, 10,  5,  9,  6, 10,  5,  9,},
  {  14,  2, 13,  1, 14,  2, 13,  1,},
},{
  {   9, 17, 15, 23,  8, 16, 14, 22,},
  {  25,  1, 31,  7, 24,  0, 30,  6,},
  {  13, 21, 11, 19, 12, 20, 10, 18,},
  {  29,  5, 27,  3, 28,  4, 26,  2,},
  {   8, 16, 14, 22,  9, 17, 15, 23,},
  {  24,  0, 30,  6, 25,  1, 31,  7,},
  {  12, 20, 10, 18, 13, 21, 11, 19,},
  {  28,  4, 26,  2, 29,  5, 27,  3,},
},{
  {  18, 34, 30, 46, 17, 33, 29, 45,},
  {  50,  2, 62, 14, 49,  1, 61, 13,},
  {  26, 42, 22, 38, 25, 41, 21, 37,},
  {  58, 10, 54,  6, 57,  9, 53,  5,},
  {  16, 32, 28, 44, 19, 35, 31, 47,},
  {  48,  0, 60, 12, 51,  3, 63, 15,},
  {  24, 40, 20, 36, 27, 43, 23, 39,},
  {  56,  8, 52,  4, 59, 11, 55,  7,},
},{
  {  18, 34, 30, 46, 17, 33, 29, 45,},
  {  50,  2, 62, 14, 49,  1, 61, 13,},
  {  26, 42, 22, 38, 25, 41, 21, 37,},
  {  58, 10, 54,  6, 57,  9, 53,  5,},
  {  16, 32, 28, 44, 19, 35, 31, 47,},
  {  48,  0, 60, 12, 51,  3, 63, 15,},
  {  24, 40, 20, 36, 27, 43, 23, 39,},
  {  56,  8, 52,  4, 59, 11, 55,  7,},
},{
  {  36, 68, 60, 92, 34, 66, 58, 90,},
  { 100,  4,124, 28, 98,  2,122, 26,},
  {  52, 84, 44, 76, 50, 82, 42, 74,},
  { 116, 20,108, 12,114, 18,106, 10,},
  {  32, 64, 56, 88, 38, 70, 62, 94,},
  {  96,  0,120, 24,102,  6,126, 30,},
  {  48, 80, 40, 72, 54, 86, 46, 78,},
  { 112, 16,104,  8,118, 22,110, 14,},
}};
 239
/* File-local row of eight 64s; same contents as ff_sws_pb_64 but without
 * the alignment request. */
static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 241
/*
 * 15 x 16 lookup table of 16-bit scale factors, externally visible.
 * NOTE(review): the meaning of the two indices is not visible in this part
 * of the file -- presumably they are derived from the destination and
 * source bit depths; confirm against the code that reads this table.
 */
const uint16_t dither_scale[15][16]={
{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
};
 259
 260 #define output_pixel(pos, val, bias, signedness) \
 261     if (big_endian) { \
 262         AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 263     } else { \
 264         AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 265     }
 266
 267 static av_always_inline void
 268 yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
 269                          int big_endian, int output_bits)
 270 {
 271     int i;
 272     int shift = 3;
 273     av_assert0(output_bits == 16);
 274
 275     for (i = 0; i < dstW; i++) {
 276         int val = src[i] + (1 << (shift - 1));
 277         output_pixel(&dest[i], val, 0, uint);
 278     }
 279 }
 280
 281 static av_always_inline void
 282 yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
 283                          const int32_t **src, uint16_t *dest, int dstW,
 284                          int big_endian, int output_bits)
 285 {
 286     int i;
 287     int shift = 15;
 288     av_assert0(output_bits == 16);
 289
 290     for (i = 0; i < dstW; i++) {
 291         int val = 1 << (shift - 1);
 292         int j;
 293
 294         /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
 295          * filters (or anything with negative coeffs, the range can be slightly
 296          * wider in both directions. To account for this overflow, we subtract
 297          * a constant so it always fits in the signed range (assuming a
 298          * reasonable filterSize), and re-add that at the end. */
 299         val -= 0x40000000;
 300         for (j = 0; j < filterSize; j++)
 301             val += src[j][i] * filter[j];
 302
 303         output_pixel(&dest[i], val, 0x8000, int);
 304     }
 305 }
 306
 307 #undef output_pixel
 308
 309 #define output_pixel(pos, val) \
 310     if (big_endian) { \
 311         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 312     } else { \
 313         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 314     }
 315
 316 static av_always_inline void
 317 yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
 318                          int big_endian, int output_bits)
 319 {
 320     int i;
 321     int shift = 15 - output_bits;
 322
 323     for (i = 0; i < dstW; i++) {
 324         int val = src[i] + (1 << (shift - 1));
 325         output_pixel(&dest[i], val);
 326     }
 327 }
 328
 329 static av_always_inline void
 330 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
 331                          const int16_t **src, uint16_t *dest, int dstW,
 332                          int big_endian, int output_bits)
 333 {
 334     int i;
 335     int shift = 11 + 16 - output_bits;
 336
 337     for (i = 0; i < dstW; i++) {
 338         int val = 1 << (shift - 1);
 339         int j;
 340
 341         for (j = 0; j < filterSize; j++)
 342             val += src[j][i] * filter[j];
 343
 344         output_pixel(&dest[i], val);
 345     }
 346 }
 347
 348 #undef output_pixel
 349
/*
 * Generate the planar output functions for one bit depth / endianness.
 *
 * yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) defines
 *   yuv2plane1_<bits><BE_LE>_c()  - single-line (unfiltered) output
 *   yuv2planeX_<bits><BE_LE>_c()  - vertically filtered output
 * Both are thin wrappers that cast src to typeX_t and dest to uint16_t and
 * call the matching yuv2plane{1,X}_<template_size>_c_template() with is_be
 * and bits as the endianness and depth arguments. The dither and offset
 * parameters exist only to match the common signature; they are not used.
 *
 * Instantiated below for 9- and 10-bit (10-bit template, int16_t input)
 * and 16-bit (16-bit template, int32_t input), each in BE and LE flavours.
 */
#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
                              uint8_t *dest, int dstW, \
                              const uint8_t *dither, int offset)\
{ \
    yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
                         (uint16_t *) dest, dstW, is_be, bits); \
}\
static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
                              const int16_t **src, uint8_t *dest, int dstW, \
                              const uint8_t *dither, int offset)\
{ \
    yuv2planeX_## template_size ## _c_template(filter, \
                         filterSize, (const typeX_t **) src, \
                         (uint16_t *) dest, dstW, is_be, bits); \
}
yuv2NBPS( 9, BE, 1, 10, int16_t)
yuv2NBPS( 9, LE, 0, 10, int16_t)
yuv2NBPS(10, BE, 1, 10, int16_t)
yuv2NBPS(10, LE, 0, 10, int16_t)
yuv2NBPS(16, BE, 1, 16, int32_t)
yuv2NBPS(16, LE, 0, 16, int32_t)
 372
 373 static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
 374                            const int16_t **src, uint8_t *dest, int dstW,
 375                            const uint8_t *dither, int offset)
 376 {
 377     int i;
 378     for (i=0; i<dstW; i++) {
 379         int val = dither[(i + offset) & 7] << 12;
 380         int j;
 381         for (j=0; j<filterSize; j++)
 382             val += src[j][i] * filter[j];
 383
 384         dest[i]= av_clip_uint8(val>>19);
 385     }
 386 }
 387
 388 static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
 389                            const uint8_t *dither, int offset)
 390 {
 391     int i;
 392     for (i=0; i<dstW; i++) {
 393         int val = (src[i] + dither[(i + offset) & 7]) >> 7;
 394         dest[i]= av_clip_uint8(val);
 395     }
 396 }
 397
 398 static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
 399                         const int16_t **chrUSrc, const int16_t **chrVSrc,
 400                         uint8_t *dest, int chrDstW)
 401 {
 402     enum PixelFormat dstFormat = c->dstFormat;
 403     const uint8_t *chrDither = c->chrDither8;
 404     int i;
 405
 406     if (dstFormat == PIX_FMT_NV12)
 407         for (i=0; i<chrDstW; i++) {
 408             int u = chrDither[i & 7] << 12;
 409             int v = chrDither[(i + 3) & 7] << 12;
 410             int j;
 411             for (j=0; j<chrFilterSize; j++) {
 412                 u += chrUSrc[j][i] * chrFilter[j];
 413                 v += chrVSrc[j][i] * chrFilter[j];
 414             }
 415
 416             dest[2*i]= av_clip_uint8(u>>19);
 417             dest[2*i+1]= av_clip_uint8(v>>19);
 418         }
 419     else
 420         for (i=0; i<chrDstW; i++) {
 421             int u = chrDither[i & 7] << 12;
 422             int v = chrDither[(i + 3) & 7] << 12;
 423             int j;
 424             for (j=0; j<chrFilterSize; j++) {
 425                 u += chrUSrc[j][i] * chrFilter[j];
 426                 v += chrVSrc[j][i] * chrFilter[j];
 427             }
 428
 429             dest[2*i]= av_clip_uint8(v>>19);
 430             dest[2*i+1]= av_clip_uint8(u>>19);
 431         }
 432 }
 433
 434 #define output_pixel(pos, val) \
 435         if (target == PIX_FMT_GRAY16BE) { \
 436             AV_WB16(pos, val); \
 437         } else { \
 438             AV_WL16(pos, val); \
 439         }
 440
 441 static av_always_inline void
 442 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 443                         const int32_t **lumSrc, int lumFilterSize,
 444                         const int16_t *chrFilter, const int32_t **chrUSrc,
 445                         const int32_t **chrVSrc, int chrFilterSize,
 446                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 447                         int y, enum PixelFormat target)
 448 {
 449     int i;
 450
 451     for (i = 0; i < (dstW >> 1); i++) {
 452         int j;
 453         int Y1 = (1 << 14) - 0x40000000;
 454         int Y2 = (1 << 14) - 0x40000000;
 455
 456         for (j = 0; j < lumFilterSize; j++) {
 457             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 458             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 459         }
 460         Y1 >>= 15;
 461         Y2 >>= 15;
 462         Y1 = av_clip_int16(Y1);
 463         Y2 = av_clip_int16(Y2);
 464         output_pixel(&dest[i * 2 + 0], 0x8000 + Y1);
 465         output_pixel(&dest[i * 2 + 1], 0x8000 + Y2);
 466     }
 467 }
 468
 469 static av_always_inline void
 470 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 471                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 472                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 473                         int yalpha, int uvalpha, int y,
 474                         enum PixelFormat target)
 475 {
 476     int  yalpha1 = 4095 - yalpha;
 477     int i;
 478     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 479
 480     for (i = 0; i < (dstW >> 1); i++) {
 481         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 482         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 483
 484         output_pixel(&dest[i * 2 + 0], Y1);
 485         output_pixel(&dest[i * 2 + 1], Y2);
 486     }
 487 }
 488
 489 static av_always_inline void
 490 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 491                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 492                         const int32_t *abuf0, uint16_t *dest, int dstW,
 493                         int uvalpha, int y, enum PixelFormat target)
 494 {
 495     int i;
 496
 497     for (i = 0; i < (dstW >> 1); i++) {
 498         int Y1 = (buf0[i * 2    ]+4)>>3;
 499         int Y2 = (buf0[i * 2 + 1]+4)>>3;
 500
 501         output_pixel(&dest[i * 2 + 0], Y1);
 502         output_pixel(&dest[i * 2 + 1], Y2);
 503     }
 504 }
 505
 506 #undef output_pixel
 507
/*
 * Generate the three packed-output entry points for a format whose
 * intermediates are 32-bit and whose output samples are 16-bit.
 *
 * YUV2PACKED16WRAPPER(name, base, ext, fmt) defines
 *   <name><ext>_X_c()  - full vertical filter
 *   <name><ext>_2_c()  - blend of two input lines
 *   <name><ext>_1_c()  - single input line
 * Each wrapper has the common int16_t/uint8_t-typed signature,
 * reinterprets the source pointers as int32_t and the destination as
 * uint16_t, and forwards to <name><base>_{X,2,1}_c_template() with fmt as
 * the pixel-format argument.
 *
 * Instantiated below for little- and big-endian 16-bit gray.
 */
#define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                        const int16_t **_lumSrc, int lumFilterSize, \
                        const int16_t *chrFilter, const int16_t **_chrUSrc, \
                        const int16_t **_chrVSrc, int chrFilterSize, \
                        const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
                        int y) \
{ \
    const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
                  **chrUSrc = (const int32_t **) _chrUSrc, \
                  **chrVSrc = (const int32_t **) _chrVSrc, \
                  **alpSrc  = (const int32_t **) _alpSrc; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                          alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
                        const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
                        int yalpha, int uvalpha, int y) \
{ \
    const int32_t **buf  = (const int32_t **) _buf, \
                  **ubuf = (const int32_t **) _ubuf, \
                  **vbuf = (const int32_t **) _vbuf, \
                  **abuf = (const int32_t **) _abuf; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                          dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
                        const int16_t *_abuf0, uint8_t *_dest, int dstW, \
                        int uvalpha, int y) \
{ \
    const int32_t *buf0  = (const int32_t *)  _buf0, \
                 **ubuf  = (const int32_t **) _ubuf, \
                 **vbuf  = (const int32_t **) _vbuf, \
                  *abuf0 = (const int32_t *)  _abuf0; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt); \
}

YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE)
YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE)
 556
 557 #define output_pixel(pos, acc) \
 558     if (target == PIX_FMT_MONOBLACK) { \
 559         pos = acc; \
 560     } else { \
 561         pos = ~acc; \
 562     }
 563
 564 static av_always_inline void
 565 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 566                       const int16_t **lumSrc, int lumFilterSize,
 567                       const int16_t *chrFilter, const int16_t **chrUSrc,
 568                       const int16_t **chrVSrc, int chrFilterSize,
 569                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 570                       int y, enum PixelFormat target)
 571 {
 572     const uint8_t * const d128=dither_8x8_220[y&7];
 573     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 574     int i;
 575     unsigned acc = 0;
 576
 577     for (i = 0; i < dstW - 1; i += 2) {
 578         int j;
 579         int Y1 = 1 << 18;
 580         int Y2 = 1 << 18;
 581
 582         for (j = 0; j < lumFilterSize; j++) {
 583             Y1 += lumSrc[j][i]   * lumFilter[j];
 584             Y2 += lumSrc[j][i+1] * lumFilter[j];
 585         }
 586         Y1 >>= 19;
 587         Y2 >>= 19;
 588         if ((Y1 | Y2) & 0x100) {
 589             Y1 = av_clip_uint8(Y1);
 590             Y2 = av_clip_uint8(Y2);
 591         }
 592         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 593         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 594         if ((i & 7) == 6) {
 595             output_pixel(*dest++, acc);
 596         }
 597     }
 598 }
 599
 600 static av_always_inline void
 601 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 602                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 603                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 604                       int yalpha, int uvalpha, int y,
 605                       enum PixelFormat target)
 606 {
 607     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 608     const uint8_t * const d128 = dither_8x8_220[y & 7];
 609     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 610     int  yalpha1 = 4095 - yalpha;
 611     int i;
 612
 613     for (i = 0; i < dstW - 7; i += 8) {
 614         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 615         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 616         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 617         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 618         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 619         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 620         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 621         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 622         output_pixel(*dest++, acc);
 623     }
 624 }
 625
 626 static av_always_inline void
 627 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 628                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 629                       const int16_t *abuf0, uint8_t *dest, int dstW,
 630                       int uvalpha, int y, enum PixelFormat target)
 631 {
 632     const uint8_t * const d128 = dither_8x8_220[y & 7];
 633     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 634     int i;
 635
 636     for (i = 0; i < dstW - 7; i += 8) {
 637         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 638         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 639         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 640         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 641         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 642         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 643         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 644         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 645         output_pixel(*dest++, acc);
 646     }
 647 }
 648
 649 #undef output_pixel
 650
/*
 * Generate the three packed-output entry points for a format that uses
 * 16-bit intermediates and byte-addressed output.
 *
 * YUV2PACKEDWRAPPER(name, base, ext, fmt) defines
 *   <name><ext>_X_c()  - full vertical filter
 *   <name><ext>_2_c()  - blend of two input lines
 *   <name><ext>_1_c()  - single input line
 * Each wrapper forwards its arguments unchanged to
 * <name><base>_{X,2,1}_c_template() and appends fmt as the pixel-format
 * argument.
 *
 * Instantiated below for the two monochrome formats.
 */
#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
                                  abuf0, dest, dstW, uvalpha, \
                                  y, fmt); \
}

YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE)
YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK)
 685
 686 #define output_pixels(pos, Y1, U, Y2, V) \
 687     if (target == PIX_FMT_YUYV422) { \
 688         dest[pos + 0] = Y1; \
 689         dest[pos + 1] = U;  \
 690         dest[pos + 2] = Y2; \
 691         dest[pos + 3] = V;  \
 692     } else { \
 693         dest[pos + 0] = U;  \
 694         dest[pos + 1] = Y1; \
 695         dest[pos + 2] = V;  \
 696         dest[pos + 3] = Y2; \
 697     }
 698
 699 static av_always_inline void
 700 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 701                      const int16_t **lumSrc, int lumFilterSize,
 702                      const int16_t *chrFilter, const int16_t **chrUSrc,
 703                      const int16_t **chrVSrc, int chrFilterSize,
 704                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 705                      int y, enum PixelFormat target)
 706 {
 707     int i;
 708
 709     for (i = 0; i < (dstW >> 1); i++) {
 710         int j;
 711         int Y1 = 1 << 18;
 712         int Y2 = 1 << 18;
 713         int U  = 1 << 18;
 714         int V  = 1 << 18;
 715
 716         for (j = 0; j < lumFilterSize; j++) {
 717             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 718             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 719         }
 720         for (j = 0; j < chrFilterSize; j++) {
 721             U += chrUSrc[j][i] * chrFilter[j];
 722             V += chrVSrc[j][i] * chrFilter[j];
 723         }
 724         Y1 >>= 19;
 725         Y2 >>= 19;
 726         U  >>= 19;
 727         V  >>= 19;
 728         if ((Y1 | Y2 | U | V) & 0x100) {
 729             Y1 = av_clip_uint8(Y1);
 730             Y2 = av_clip_uint8(Y2);
 731             U  = av_clip_uint8(U);
 732             V  = av_clip_uint8(V);
 733         }
 734         output_pixels(4*i, Y1, U, Y2, V);
 735     }
 736 }
 737
 738 static av_always_inline void
 739 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 740                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 741                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 742                      int yalpha, int uvalpha, int y,
 743                      enum PixelFormat target)
 744 {
 745     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 746                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 747                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 748     int  yalpha1 = 4095 - yalpha;
 749     int uvalpha1 = 4095 - uvalpha;
 750     int i;
 751
 752     for (i = 0; i < (dstW >> 1); i++) {
 753         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 754         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 755         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 756         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 757
 758         output_pixels(i * 4, Y1, U, Y2, V);
 759     }
 760 }
 761
 762 static av_always_inline void
 763 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 764                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 765                      const int16_t *abuf0, uint8_t *dest, int dstW,
 766                      int uvalpha, int y, enum PixelFormat target)
 767 {
 768     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 769                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 770     int i;
 771
 772     if (uvalpha < 2048) {
 773         for (i = 0; i < (dstW >> 1); i++) {
 774             int Y1 = buf0[i * 2]     >> 7;
 775             int Y2 = buf0[i * 2 + 1] >> 7;
 776             int U  = ubuf1[i]        >> 7;
 777             int V  = vbuf1[i]        >> 7;
 778
 779             output_pixels(i * 4, Y1, U, Y2, V);
 780         }
 781     } else {
 782         for (i = 0; i < (dstW >> 1); i++) {
 783             int Y1 =  buf0[i * 2]          >> 7;
 784             int Y2 =  buf0[i * 2 + 1]      >> 7;
 785             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 786             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 787
 788             output_pixels(i * 4, Y1, U, Y2, V);
 789         }
 790     }
 791 }
 792
 /* output_pixels is not needed past this point. */
 793 #undef output_pixels
 794
 /* Instantiate the packed 4:2:2 writers for the YUYV and UYVY layouts.
  * NOTE(review): YUV2PACKEDWRAPPER is defined outside this section; presumably
  * it generates the _X_c/_2_c/_1_c entry points from the templates above. */
 795 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422)
 796 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422)
 797
 /* Component order helpers for the 48-bit writers below: R_B evaluates to R for
  * the RGB48 targets and to B otherwise, B_R is the opposite. Both expect
  * locals named R and B and a `target` variable in scope. */
 798 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 799 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 /* Store val at pos through AV_WB16 when isBE(target) is true and through
  * AV_WL16 otherwise. Expands to a bare if/else statement, so it must only be
  * used where a full statement is valid. */
 800 #define output_pixel(pos, val) \
 801     if (isBE(target)) { \
 802         AV_WB16(pos, val); \
 803     } else { \
 804         AV_WL16(pos, val); \
 805     }
 806
 807 static av_always_inline void
 808 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 809                        const int32_t **lumSrc, int lumFilterSize,
 810                        const int16_t *chrFilter, const int32_t **chrUSrc,
 811                        const int32_t **chrVSrc, int chrFilterSize,
 812                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 813                        int y, enum PixelFormat target)
 814 {
 815     int i;
 816
 817     for (i = 0; i < (dstW >> 1); i++) {
 818         int j;
 819         int Y1 = -0x40000000;
 820         int Y2 = -0x40000000;
 821         int U  = -128 << 23; // 19
 822         int V  = -128 << 23;
 823         int R, G, B;
 824
 825         for (j = 0; j < lumFilterSize; j++) {
 826             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 827             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 828         }
 829         for (j = 0; j < chrFilterSize; j++) {
 830             U += chrUSrc[j][i] * chrFilter[j];
 831             V += chrVSrc[j][i] * chrFilter[j];
 832         }
 833
 834         // 8bit: 12+15=27; 16-bit: 12+19=31
 835         Y1 >>= 14; // 10
 836         Y1 += 0x10000;
 837         Y2 >>= 14;
 838         Y2 += 0x10000;
 839         U  >>= 14;
 840         V  >>= 14;
 841
 842         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 843         Y1 -= c->yuv2rgb_y_offset;
 844         Y2 -= c->yuv2rgb_y_offset;
 845         Y1 *= c->yuv2rgb_y_coeff;
 846         Y2 *= c->yuv2rgb_y_coeff;
 847         Y1 += 1 << 13; // 21
 848         Y2 += 1 << 13;
 849         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 850
 851         R = V * c->yuv2rgb_v2r_coeff;
 852         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 853         B =                            U * c->yuv2rgb_u2b_coeff;
 854
 855         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 856         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 857         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 858         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 859         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 860         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 861         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 862         dest += 6;
 863     }
 864 }
 865
 866 static av_always_inline void
 867 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 868                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 869                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 870                        int yalpha, int uvalpha, int y,
 871                        enum PixelFormat target)
 872 {
 873     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 874                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 875                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 876     int  yalpha1 = 4095 - yalpha;
 877     int uvalpha1 = 4095 - uvalpha;
 878     int i;
 879
 880     for (i = 0; i < (dstW >> 1); i++) {
 881         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 882         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 883         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 884         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 885         int R, G, B;
 886
 887         Y1 -= c->yuv2rgb_y_offset;
 888         Y2 -= c->yuv2rgb_y_offset;
 889         Y1 *= c->yuv2rgb_y_coeff;
 890         Y2 *= c->yuv2rgb_y_coeff;
 891         Y1 += 1 << 13;
 892         Y2 += 1 << 13;
 893
 894         R = V * c->yuv2rgb_v2r_coeff;
 895         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 896         B =                            U * c->yuv2rgb_u2b_coeff;
 897
 898         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 899         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 900         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 901         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 902         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 903         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 904         dest += 6;
 905     }
 906 }
 907
 908 static av_always_inline void
 909 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 910                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 911                        const int32_t *abuf0, uint16_t *dest, int dstW,
 912                        int uvalpha, int y, enum PixelFormat target)
 913 {
 914     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 915                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 916     int i;
 917
 918     if (uvalpha < 2048) {
 919         for (i = 0; i < (dstW >> 1); i++) {
 920             int Y1 = (buf0[i * 2]    ) >> 2;
 921             int Y2 = (buf0[i * 2 + 1]) >> 2;
 922             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 923             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 924             int R, G, B;
 925
 926             Y1 -= c->yuv2rgb_y_offset;
 927             Y2 -= c->yuv2rgb_y_offset;
 928             Y1 *= c->yuv2rgb_y_coeff;
 929             Y2 *= c->yuv2rgb_y_coeff;
 930             Y1 += 1 << 13;
 931             Y2 += 1 << 13;
 932
 933             R = V * c->yuv2rgb_v2r_coeff;
 934             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 935             B =                            U * c->yuv2rgb_u2b_coeff;
 936
 937             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 938             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 939             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 940             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 941             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 942             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 943             dest += 6;
 944         }
 945     } else {
 946         for (i = 0; i < (dstW >> 1); i++) {
 947             int Y1 = (buf0[i * 2]    ) >> 2;
 948             int Y2 = (buf0[i * 2 + 1]) >> 2;
 949             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
 950             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
 951             int R, G, B;
 952
 953             Y1 -= c->yuv2rgb_y_offset;
 954             Y2 -= c->yuv2rgb_y_offset;
 955             Y1 *= c->yuv2rgb_y_coeff;
 956             Y2 *= c->yuv2rgb_y_coeff;
 957             Y1 += 1 << 13;
 958             Y2 += 1 << 13;
 959
 960             R = V * c->yuv2rgb_v2r_coeff;
 961             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 962             B =                            U * c->yuv2rgb_u2b_coeff;
 963
 964             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 965             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 966             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 967             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 968             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 969             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 970             dest += 6;
 971         }
 972     }
 973 }
 974
 975 #undef output_pixel
 976 #undef r_b
 977 #undef b_r
 978
 979 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE)
 980 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE)
 981 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE)
 982 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
 983
 984 static av_always_inline void
 985 yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
 986               unsigned U, unsigned V, unsigned A1, unsigned A2,
 987               const void *_r, const void *_g, const void *_b, int y,
 988               enum PixelFormat target, int hasAlpha)
 989 {
 990     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 991         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 992         uint32_t *dest = (uint32_t *) _dest;
 993         const uint32_t *r = (const uint32_t *) _r;
 994         const uint32_t *g = (const uint32_t *) _g;
 995         const uint32_t *b = (const uint32_t *) _b;
 996
 997 #if CONFIG_SMALL
 998         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 999
1000         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1001         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1002 #else
1003         if (hasAlpha) {
1004             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1005
1006             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1007             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1008         } else {
1009             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1010             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1011         }
1012 #endif
1013     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1014         uint8_t *dest = (uint8_t *) _dest;
1015         const uint8_t *r = (const uint8_t *) _r;
1016         const uint8_t *g = (const uint8_t *) _g;
1017         const uint8_t *b = (const uint8_t *) _b;
1018
1019 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1020 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1021
1022         dest[i * 6 + 0] = r_b[Y1];
1023         dest[i * 6 + 1] =   g[Y1];
1024         dest[i * 6 + 2] = b_r[Y1];
1025         dest[i * 6 + 3] = r_b[Y2];
1026         dest[i * 6 + 4] =   g[Y2];
1027         dest[i * 6 + 5] = b_r[Y2];
1028 #undef r_b
1029 #undef b_r
1030     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1031                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1032                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1033         uint16_t *dest = (uint16_t *) _dest;
1034         const uint16_t *r = (const uint16_t *) _r;
1035         const uint16_t *g = (const uint16_t *) _g;
1036         const uint16_t *b = (const uint16_t *) _b;
1037         int dr1, dg1, db1, dr2, dg2, db2;
1038
1039         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1040             dr1 = dither_2x2_8[ y & 1     ][0];
1041             dg1 = dither_2x2_4[ y & 1     ][0];
1042             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1043             dr2 = dither_2x2_8[ y & 1     ][1];
1044             dg2 = dither_2x2_4[ y & 1     ][1];
1045             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1046         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1047             dr1 = dither_2x2_8[ y & 1     ][0];
1048             dg1 = dither_2x2_8[ y & 1     ][1];
1049             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1050             dr2 = dither_2x2_8[ y & 1     ][1];
1051             dg2 = dither_2x2_8[ y & 1     ][0];
1052             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1053         } else {
1054             dr1 = dither_4x4_16[ y & 3     ][0];
1055             dg1 = dither_4x4_16[ y & 3     ][1];
1056             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1057             dr2 = dither_4x4_16[ y & 3     ][1];
1058             dg2 = dither_4x4_16[ y & 3     ][0];
1059             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1060         }
1061
1062         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1063         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1064     } else /* 8/4-bit */ {
1065         uint8_t *dest = (uint8_t *) _dest;
1066         const uint8_t *r = (const uint8_t *) _r;
1067         const uint8_t *g = (const uint8_t *) _g;
1068         const uint8_t *b = (const uint8_t *) _b;
1069         int dr1, dg1, db1, dr2, dg2, db2;
1070
1071         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1072             const uint8_t * const d64 = dither_8x8_73[y & 7];
1073             const uint8_t * const d32 = dither_8x8_32[y & 7];
1074             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1075             db1 =       d64[(i * 2 + 0) & 7];
1076             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1077             db2 =       d64[(i * 2 + 1) & 7];
1078         } else {
1079             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1080             const uint8_t * const d128 = dither_8x8_220[y & 7];
1081             dr1 = db1 = d128[(i * 2 + 0) & 7];
1082             dg1 =        d64[(i * 2 + 0) & 7];
1083             dr2 = db2 = d128[(i * 2 + 1) & 7];
1084             dg2 =        d64[(i * 2 + 1) & 7];
1085         }
1086
1087         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1088             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1089                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1090         } else {
1091             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1092             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1093         }
1094     }
1095 }
1096
1097 static av_always_inline void
1098 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1099                      const int16_t **lumSrc, int lumFilterSize,
1100                      const int16_t *chrFilter, const int16_t **chrUSrc,
1101                      const int16_t **chrVSrc, int chrFilterSize,
1102                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1103                      int y, enum PixelFormat target, int hasAlpha)
1104 {
1105     int i;
1106
1107     for (i = 0; i < (dstW >> 1); i++) {
1108         int j;
1109         int Y1 = 1 << 18;
1110         int Y2 = 1 << 18;
1111         int U  = 1 << 18;
1112         int V  = 1 << 18;
1113         int av_unused A1, A2;
1114         const void *r, *g, *b;
1115
1116         for (j = 0; j < lumFilterSize; j++) {
1117             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1118             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1119         }
1120         for (j = 0; j < chrFilterSize; j++) {
1121             U += chrUSrc[j][i] * chrFilter[j];
1122             V += chrVSrc[j][i] * chrFilter[j];
1123         }
1124         Y1 >>= 19;
1125         Y2 >>= 19;
1126         U  >>= 19;
1127         V  >>= 19;
1128         if ((Y1 | Y2 | U | V) & 0x100) {
1129             Y1 = av_clip_uint8(Y1);
1130             Y2 = av_clip_uint8(Y2);
1131             U  = av_clip_uint8(U);
1132             V  = av_clip_uint8(V);
1133         }
1134         if (hasAlpha) {
1135             A1 = 1 << 18;
1136             A2 = 1 << 18;
1137             for (j = 0; j < lumFilterSize; j++) {
1138                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1139                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1140             }
1141             A1 >>= 19;
1142             A2 >>= 19;
1143             if ((A1 | A2) & 0x100) {
1144                 A1 = av_clip_uint8(A1);
1145                 A2 = av_clip_uint8(A2);
1146             }
1147         }
1148
1149         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1150         r =  c->table_rV[V];
1151         g = (c->table_gU[U] + c->table_gV[V]);
1152         b =  c->table_bU[U];
1153
1154         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1155                       r, g, b, y, target, hasAlpha);
1156     }
1157 }
1158
1159 static av_always_inline void
1160 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1161                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1162                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1163                      int yalpha, int uvalpha, int y,
1164                      enum PixelFormat target, int hasAlpha)
1165 {
1166     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1167                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1168                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1169                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1170                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1171     int  yalpha1 = 4095 - yalpha;
1172     int uvalpha1 = 4095 - uvalpha;
1173     int i;
1174
1175     for (i = 0; i < (dstW >> 1); i++) {
1176         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1177         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1178         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1179         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1180         int A1, A2;
1181         const void *r =  c->table_rV[V],
1182                    *g = (c->table_gU[U] + c->table_gV[V]),
1183                    *b =  c->table_bU[U];
1184
1185         if (hasAlpha) {
1186             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1187             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1188         }
1189
1190         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1191                       r, g, b, y, target, hasAlpha);
1192     }
1193 }
1194
1195 static av_always_inline void
1196 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1197                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1198                      const int16_t *abuf0, uint8_t *dest, int dstW,
1199                      int uvalpha, int y, enum PixelFormat target,
1200                      int hasAlpha)
1201 {
1202     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1203                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1204     int i;
1205
1206     if (uvalpha < 2048) {
1207         for (i = 0; i < (dstW >> 1); i++) {
1208             int Y1 = buf0[i * 2]     >> 7;
1209             int Y2 = buf0[i * 2 + 1] >> 7;
1210             int U  = ubuf1[i]        >> 7;
1211             int V  = vbuf1[i]        >> 7;
1212             int A1, A2;
1213             const void *r =  c->table_rV[V],
1214                        *g = (c->table_gU[U] + c->table_gV[V]),
1215                        *b =  c->table_bU[U];
1216
1217             if (hasAlpha) {
1218                 A1 = abuf0[i * 2    ] >> 7;
1219                 A2 = abuf0[i * 2 + 1] >> 7;
1220             }
1221
1222             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1223                           r, g, b, y, target, hasAlpha);
1224         }
1225     } else {
1226         for (i = 0; i < (dstW >> 1); i++) {
1227             int Y1 =  buf0[i * 2]          >> 7;
1228             int Y2 =  buf0[i * 2 + 1]      >> 7;
1229             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1230             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1231             int A1, A2;
1232             const void *r =  c->table_rV[V],
1233                        *g = (c->table_gU[U] + c->table_gV[V]),
1234                        *b =  c->table_bU[U];
1235
1236             if (hasAlpha) {
1237                 A1 = abuf0[i * 2    ] >> 7;
1238                 A2 = abuf0[i * 2 + 1] >> 7;
1239             }
1240
1241             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1242                           r, g, b, y, target, hasAlpha);
1243         }
1244     }
1245 }
1246
/* YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha): define the function
 * <name><ext>_X_c, which forwards all of its arguments to
 * <name><base>_X_c_template with fmt and hasAlpha appended. */
1247 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1248 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1249                                 const int16_t **lumSrc, int lumFilterSize, \
1250                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1251                                 const int16_t **chrVSrc, int chrFilterSize, \
1252                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1253                                 int y) \
1254 { \
1255     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1256                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1257                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1258 }
/* YUV2RGBWRAPPER: as YUV2RGBWRAPPERX, and additionally define
 * <name><ext>_2_c and <name><ext>_1_c forwarding to the matching
 * _2_c_template / _1_c_template. */
1259 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1260 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1261 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1262                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1263                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1264                                 int yalpha, int uvalpha, int y) \
1265 { \
1266     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1267                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1268 } \
1269  \
1270 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1271                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1272                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1273                                 int uvalpha, int y) \
1274 { \
1275     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1276                                   dstW, uvalpha, y, fmt, hasAlpha); \
1277 }
1278
/* Instantiate the table-based RGB writers. With CONFIG_SMALL a single 32-bit
 * variant per format decides about alpha at run time (c->alpPixBuf);
 * otherwise separate a32 (hasAlpha = 1, only with CONFIG_SWSCALE_ALPHA) and
 * x32 (hasAlpha = 0) variants are generated. */
1279 #if CONFIG_SMALL
1280 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1281 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1282 #else
1283 #if CONFIG_SWSCALE_ALPHA
1284 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1)
1285 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1)
1286 #endif
1287 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0)
1288 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0)
1289 #endif
/* The remaining formats never carry alpha. */
1290 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0)
1291 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0)
1292 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0)
1293 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0)
1294 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0)
1295 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0)
1296 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0)
1297 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0)
1298
1299 static av_always_inline void
1300 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1301                           const int16_t **lumSrc, int lumFilterSize,
1302                           const int16_t *chrFilter, const int16_t **chrUSrc,
1303                           const int16_t **chrVSrc, int chrFilterSize,
1304                           const int16_t **alpSrc, uint8_t *dest,
1305                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1306 {
1307     int i;
1308     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1309
1310     for (i = 0; i < dstW; i++) {
1311         int j;
1312         int Y = 1<<9;
1313         int U = (1<<9)-(128 << 19);
1314         int V = (1<<9)-(128 << 19);
1315         int av_unused A;
1316         int R, G, B;
1317
1318         for (j = 0; j < lumFilterSize; j++) {
1319             Y += lumSrc[j][i] * lumFilter[j];
1320         }
1321         for (j = 0; j < chrFilterSize; j++) {
1322             U += chrUSrc[j][i] * chrFilter[j];
1323             V += chrVSrc[j][i] * chrFilter[j];
1324         }
1325         Y >>= 10;
1326         U >>= 10;
1327         V >>= 10;
1328         if (hasAlpha) {
1329             A = 1 << 18;
1330             for (j = 0; j < lumFilterSize; j++) {
1331                 A += alpSrc[j][i] * lumFilter[j];
1332             }
1333             A >>= 19;
1334             if (A & 0x100)
1335                 A = av_clip_uint8(A);
1336         }
1337         Y -= c->yuv2rgb_y_offset;
1338         Y *= c->yuv2rgb_y_coeff;
1339         Y += 1 << 21;
1340         R = Y + V*c->yuv2rgb_v2r_coeff;
1341         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1342         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1343         if ((R | G | B) & 0xC0000000) {
1344             R = av_clip_uintp2(R, 30);
1345             G = av_clip_uintp2(G, 30);
1346             B = av_clip_uintp2(B, 30);
1347         }
1348
1349         switch(target) {
1350         case PIX_FMT_ARGB:
1351             dest[0] = hasAlpha ? A : 255;
1352             dest[1] = R >> 22;
1353             dest[2] = G >> 22;
1354             dest[3] = B >> 22;
1355             break;
1356         case PIX_FMT_RGB24:
1357             dest[0] = R >> 22;
1358             dest[1] = G >> 22;
1359             dest[2] = B >> 22;
1360             break;
1361         case PIX_FMT_RGBA:
1362             dest[0] = R >> 22;
1363             dest[1] = G >> 22;
1364             dest[2] = B >> 22;
1365             dest[3] = hasAlpha ? A : 255;
1366             break;
1367         case PIX_FMT_ABGR:
1368             dest[0] = hasAlpha ? A : 255;
1369             dest[1] = B >> 22;
1370             dest[2] = G >> 22;
1371             dest[3] = R >> 22;
1372             break;
1373         case PIX_FMT_BGR24:
1374             dest[0] = B >> 22;
1375             dest[1] = G >> 22;
1376             dest[2] = R >> 22;
1377             break;
1378         case PIX_FMT_BGRA:
1379             dest[0] = B >> 22;
1380             dest[1] = G >> 22;
1381             dest[2] = R >> 22;
1382             dest[3] = hasAlpha ? A : 255;
1383             break;
1384         }
1385         dest += step;
1386     }
1387 }
1388
/* Instantiate the full-chroma RGB writers (only the _X_c variant exists for
 * them). With CONFIG_SMALL one function per format decides about alpha at run
 * time (c->alpPixBuf); otherwise separate alpha (hasAlpha = 1, only with
 * CONFIG_SWSCALE_ALPHA) and no-alpha (hasAlpha = 0) functions are generated. */
1389 #if CONFIG_SMALL
1390 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1391 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1392 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1393 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1394 #else
1395 #if CONFIG_SWSCALE_ALPHA
1396 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1)
1397 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1)
1398 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1)
1399 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1)
1400 #endif
1401 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0)
1402 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0)
1403 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0)
1404 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0)
1405 #endif
/* 24-bit output never carries alpha. */
1406 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0)
1407 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0)
1408
1409 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1410                                        int width, int height,
1411                                        int y, uint8_t val)
1412 {
1413     int i;
1414     uint8_t *ptr = plane + stride*y;
1415     for (i=0; i<height; i++) {
1416         memset(ptr, val, width);
1417         ptr += stride;
1418     }
1419 }
1420
 /* Read one 16-bit component at pos, through AV_RB16 when isBE(origin) is true
  * and through AV_RL16 otherwise. Expects `origin` to be in scope. */
1421 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1422
 /* Map the positional locals r_b (first component) and b_r (third component)
  * to red and blue: for the BGR48 origins the first component is blue, for
  * everything else it is red. */
1423 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1424 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1425
1426 static av_always_inline void
1427 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1428                     enum PixelFormat origin)
1429 {
1430     int i;
1431     for (i = 0; i < width; i++) {
1432         unsigned int r_b = input_pixel(&src[i*3+0]);
1433         unsigned int   g = input_pixel(&src[i*3+1]);
1434         unsigned int b_r = input_pixel(&src[i*3+2]);
1435
1436         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1437     }
1438 }
1439
1440 static av_always_inline void
1441 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1442                     const uint16_t *src1, const uint16_t *src2,
1443                     int width, enum PixelFormat origin)
1444 {
1445     int i;
1446     assert(src1==src2);
1447     for (i = 0; i < width; i++) {
1448         int r_b = input_pixel(&src1[i*3+0]);
1449         int   g = input_pixel(&src1[i*3+1]);
1450         int b_r = input_pixel(&src1[i*3+2]);
1451
1452         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1453         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1454     }
1455 }
1456
1457 static av_always_inline void
1458 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1459                           const uint16_t *src1, const uint16_t *src2,
1460                           int width, enum PixelFormat origin)
1461 {
1462     int i;
1463     assert(src1==src2);
1464     for (i = 0; i < width; i++) {
1465         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1466         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1467         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1468
1469         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1470         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1471     }
1472 }
1473
/* The 48-bit input templates are complete; drop their helper macros so that
 * r, b and input_pixel can be reused below. */
1474 #undef r
1475 #undef b
1476 #undef input_pixel
1477
/* rgb48funcs(pattern, BE_LE, origin): define <pattern>48<BE_LE>ToY_c,
 * <pattern>48<BE_LE>ToUV_c and <pattern>48<BE_LE>ToUV_half_c. Each one casts
 * its byte pointers to uint16_t pointers and calls the matching template with
 * the given origin format; the unused* parameters are ignored. */
1478 #define rgb48funcs(pattern, BE_LE, origin) \
1479 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
1480                                     int width, uint32_t *unused) \
1481 { \
1482     const uint16_t *src = (const uint16_t *) _src; \
1483     uint16_t *dst = (uint16_t *) _dst; \
1484     rgb48ToY_c_template(dst, src, width, origin); \
1485 } \
1486  \
1487 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1488                                     const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1489                                     int width, uint32_t *unused) \
1490 { \
1491     const uint16_t *src1 = (const uint16_t *) _src1, \
1492                    *src2 = (const uint16_t *) _src2; \
1493     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1494     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1495 } \
1496  \
1497 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1498                                     const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1499                                     int width, uint32_t *unused) \
1500 { \
1501     const uint16_t *src1 = (const uint16_t *) _src1, \
1502                    *src2 = (const uint16_t *) _src2; \
1503     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1504     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1505 }
1506
/* One set of input converters per component order and byte order. */
1507 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE)
1508 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE)
1509 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE)
1510 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
1511
/* Read pixel i of `src` for the 16/32-bit RGB input templates: the four
 * 32-bit formats are read with AV_RN32A at byte offset i * 4, everything else
 * as a 16-bit value at byte offset i * 2 through AV_RB16 or AV_RL16 depending
 * on isBE(origin). Expects `src` and `origin` to be in scope. */
1512 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1513                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1514                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1515
1516 static av_always_inline void
1517 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1518                        int width, enum PixelFormat origin,
1519                        int shr,   int shg,   int shb, int shp,
1520                        int maskr, int maskg, int maskb,
1521                        int rsh,   int gsh,   int bsh, int S)
1522 {
1523     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
1524     const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
1525     int i;
1526
1527     for (i = 0; i < width; i++) {
1528         int px = input_pixel(i) >> shp;
1529         int b = (px & maskb) >> shb;
1530         int g = (px & maskg) >> shg;
1531         int r = (px & maskr) >> shr;
1532
1533         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1534     }
1535 }
1536
1537 static av_always_inline void
1538 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1539                         const uint8_t *src, int width,
1540                         enum PixelFormat origin,
1541                         int shr,   int shg,   int shb, int shp,
1542                         int maskr, int maskg, int maskb,
1543                         int rsh,   int gsh,   int bsh, int S)
1544 {
1545     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1546               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
1547     const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
1548     int i;
1549
1550     for (i = 0; i < width; i++) {
1551         int px = input_pixel(i) >> shp;
1552         int b = (px & maskb) >> shb;
1553         int g = (px & maskg) >> shg;
1554         int r = (px & maskr) >> shr;
1555
1556         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1557         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1558     }
1559 }
1560
1561 static av_always_inline void
1562 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1563                              const uint8_t *src, int width,
1564                              enum PixelFormat origin,
1565                              int shr,   int shg,   int shb, int shp,
1566                              int maskr, int maskg, int maskb,
1567                              int rsh,   int gsh,   int bsh, int S)
1568 {
1569     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1570               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1571               maskgx = ~(maskr | maskb);
1572     const unsigned rnd = (256U<<(S)) + (1<<(S-6));
1573     int i;
1574
1575     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1576     for (i = 0; i < width; i++) {
1577         int px0 = input_pixel(2 * i + 0) >> shp;
1578         int px1 = input_pixel(2 * i + 1) >> shp;
1579         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1580         int rb = px0 + px1 - g;
1581
1582         b = (rb & maskb) >> shb;
1583         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1584             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1585             g >>= shg;
1586         } else {
1587             g = (g  & maskg) >> shg;
1588         }
1589         r = (rb & maskr) >> shr;
1590
1591         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1592         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1593     }
1594 }
1595
1596 #undef input_pixel
1597
1598 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1599                          maskg, maskb, rsh, gsh, bsh, S) \
1600 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
1601                           int width, uint32_t *unused) \
1602 { \
1603     rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, \
1604                            shr, shg, shb, shp, \
1605                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1606 } \
1607  \
1608 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1609                            const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1610                            int width, uint32_t *unused) \
1611 { \
1612     rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,  \
1613                             shr, shg, shb, shp, \
1614                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1615 } \
1616  \
1617 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1618                                 const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1619                                 int width, uint32_t *unused) \
1620 { \
1621     rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
1622                                  shr, shg, shb, shp, \
1623                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1624 }
1625
1626 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
1627 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
1628 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
1629 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
1630 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
1631 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
1632 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
1633 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
1634 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
1635 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
1636 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
1637 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
1638
1639 static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
1640                          const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
1641                          int width, enum PixelFormat origin)
1642 {
1643     int i;
1644     for (i = 0; i < width; i++) {
1645         unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
1646         unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
1647         unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
1648
1649         dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1650         dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1651     }
1652 }
1653
/**
 * Extract the alpha channel from 4-byte pixels whose alpha is the first
 * byte, scaling each 8-bit value up by 6 bits. Only dst, src and width are
 * used.
 */
static void abgrToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int n = 0;

    while (n < width) {
        const uint8_t *px = src + 4 * n;
        dst[n] = px[0] << 6;
        n++;
    }
}
1661
/**
 * Extract the alpha channel from 4-byte pixels whose alpha is the last
 * byte, scaling each 8-bit value up by 6 bits. Only dst, src and width are
 * used.
 */
static void rgbaToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int n = 0;

    while (n < width) {
        const uint8_t *px = src + 4 * n;
        dst[n] = px[3] << 6;
        n++;
    }
}
1669
/**
 * Look every 8-bit source index up in `pal` and output the top byte of the
 * palette entry, scaled up by 6 bits.
 */
static void palToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
    int n;

    for (n = 0; n < width; n++) {
        const uint32_t entry = pal[src[n]];
        dst[n] = (entry >> 24) << 6;
    }
}
1679
/**
 * Look every 8-bit source index up in `pal` and output the lowest byte of
 * the palette entry, scaled up by 6 bits.
 *
 * `width` is an int, like in palToA_c and the other input converters: the
 * function is invoked through a function pointer whose width parameter is
 * int, and with a `long` parameter the upper half of the argument would be
 * unspecified on LP64 targets, which could make the loop run far past the
 * end of the line.
 *
 * @param dst   output samples, `width` entries
 * @param src   palette indices, `width` entries
 * @param width number of samples to convert
 * @param pal   256-entry palette
 */
static void palToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
    int i;
    for (i=0; i<width; i++) {
        int d= src[i];

        dst[i]= (pal[d] & 0xFF)<<6;
    }
}
1689
/**
 * Look every 8-bit source index up in `pal` and output bits 8..15 of the
 * entry as U and bits 16..23 as V, each scaled up by 6 bits.
 * src2 must be the same pointer as src1 (asserted).
 */
static void palToUV_c(uint16_t *dstU, int16_t *dstV,
                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *pal)
{
    int n;

    assert(src1 == src2);

    for (n = 0; n < width; n++) {
        const int p = pal[src1[n]];
        const int u = (p >>  8) & 0xFF;
        const int v = (p >> 16) & 0xFF;

        dstU[n] = u << 6;
        dstV[n] = v << 6;
    }
}
1703
/**
 * Expand 1 bit-per-pixel input, most significant bit first, to 16-bit
 * samples. The source bits are inverted first: a 0 bit becomes 16383 and a
 * 1 bit becomes 0. Whole bytes are handled first, then the remaining
 * (width & 7) pixels of the last, partially used byte.
 */
static void monowhite2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width & 7;
    int byte, bit;

    for (byte = 0; byte < full_bytes; byte++) {
        const int inverted = ~src[byte];
        for (bit = 0; bit < 8; bit++)
            dst[8 * byte + bit] = ((inverted >> (7 - bit)) & 1) * 16383;
    }

    if (tail_bits) {
        const int inverted = ~src[byte];
        for (bit = 0; bit < tail_bits; bit++)
            dst[8 * byte + bit] = ((inverted >> (7 - bit)) & 1) * 16383;
    }
}
1718
/**
 * Expand 1 bit-per-pixel input, most significant bit first, to 16-bit
 * samples: a 1 bit becomes 16383 and a 0 bit becomes 0. Whole bytes are
 * handled first, then the remaining (width & 7) pixels of the last,
 * partially used byte.
 */
static void monoblack2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width & 7;
    int byte, bit;

    for (byte = 0; byte < full_bytes; byte++) {
        const int bits = src[byte];
        for (bit = 0; bit < 8; bit++)
            dst[8 * byte + bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }

    if (tail_bits) {
        const int bits = src[byte];
        for (bit = 0; bit < tail_bits; bit++)
            dst[8 * byte + bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }
}
1733
//FIXME yuy2* can read up to 7 samples too much

/**
 * Copy the luma samples of a packed line in which luma occupies every even
 * byte (byte 0, 2, 4, ...).
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                      uint32_t *unused)
{
    int n = 0;

    while (n < width) {
        dst[n] = src[2 * n];
        n++;
    }
}
1743
/**
 * Split the chroma of a packed line made of 4-byte groups: byte 1 of each
 * group goes to dstU, byte 3 to dstV. src2 must be the same pointer as src1
 * (asserted after the conversion).
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int n;

    for (n = 0; n < width; n++) {
        const uint8_t *group = src1 + 4 * n;
        dstU[n] = group[1];
        dstV[n] = group[3];
    }
    assert(src1 == src2);
}
1754
/**
 * Copy `width` 16-bit samples from _src to _dst, passing each one through
 * av_bswap16().
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const uint16_t *in  = (const uint16_t *) _src;
    uint16_t       *out = (uint16_t *) _dst;
    int n = 0;

    while (n < width) {
        out[n] = av_bswap16(in[n]);
        n++;
    }
}
1764
/**
 * Copy `width` 16-bit samples from each of the two chroma inputs to the
 * matching output, passing each sample through av_bswap16().
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *inU  = (const uint16_t *) _src1;
    const uint16_t *inV  = (const uint16_t *) _src2;
    uint16_t       *outU = (uint16_t *) _dstU;
    uint16_t       *outV = (uint16_t *) _dstV;
    int n = 0;

    while (n < width) {
        outU[n] = av_bswap16(inU[n]);
        outV[n] = av_bswap16(inV[n]);
        n++;
    }
}
1777
/* uyvyToY_c/uyvyToUV_c are almost identical to yuy2ToY_c/yuy2ToUV_c and
 * exist only because calling yuy2ToY/UV(dst, src+1, ...) instead would have
 * 100% unaligned accesses. */

/**
 * Copy the luma samples of a packed line in which luma occupies every odd
 * byte (byte 1, 3, 5, ...).
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                      uint32_t *unused)
{
    int n = 0;

    while (n < width) {
        dst[n] = src[2 * n + 1];
        n++;
    }
}
1787
/**
 * Split the chroma of a packed line made of 4-byte groups: byte 0 of each
 * group goes to dstU, byte 2 to dstV. src2 must be the same pointer as src1
 * (asserted after the conversion).
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int n;

    for (n = 0; n < width; n++) {
        const uint8_t *group = src1 + 4 * n;
        dstU[n] = group[0];
        dstV[n] = group[2];
    }
    assert(src1 == src2);
}
1798
1799 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1800                                         const uint8_t *src, int width)
1801 {
1802     int i;
1803     for (i = 0; i < width; i++) {
1804         dst1[i] = src[2*i+0];
1805         dst2[i] = src[2*i+1];
1806     }
1807 }
1808
1809 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1810                        const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1811                        int width, uint32_t *unused)
1812 {
1813     nvXXtoUV_c(dstU, dstV, src1, width);
1814 }
1815
1816 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1817                        const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1818                        int width, uint32_t *unused)
1819 {
1820     nvXXtoUV_c(dstV, dstU, src1, width);
1821 }
1822
1823 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1824
1825 static void bgr24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
1826                        int width, uint32_t *unused)
1827 {
1828     int i;
1829     for (i=0; i<width; i++) {
1830         int b= src[i*3+0];
1831         int g= src[i*3+1];
1832         int r= src[i*3+2];
1833
1834         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1835     }
1836 }
1837
1838 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1839                         const uint8_t *src2, int width, uint32_t *unused)
1840 {
1841     int i;
1842     for (i=0; i<width; i++) {
1843         int b= src1[3*i + 0];
1844         int g= src1[3*i + 1];
1845         int r= src1[3*i + 2];
1846
1847         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1848         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1849     }
1850     assert(src1 == src2);
1851 }
1852
1853 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1854                              const uint8_t *src2, int width, uint32_t *unused)
1855 {
1856     int i;
1857     for (i=0; i<width; i++) {
1858         int b= src1[6*i + 0] + src1[6*i + 3];
1859         int g= src1[6*i + 1] + src1[6*i + 4];
1860         int r= src1[6*i + 2] + src1[6*i + 5];
1861
1862         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1863         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1864     }
1865     assert(src1 == src2);
1866 }
1867
1868 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1869                        uint32_t *unused)
1870 {
1871     int i;
1872     for (i=0; i<width; i++) {
1873         int r= src[i*3+0];
1874         int g= src[i*3+1];
1875         int b= src[i*3+2];
1876
1877         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1878     }
1879 }
1880
1881 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1882                         const uint8_t *src2, int width, uint32_t *unused)
1883 {
1884     int i;
1885     assert(src1==src2);
1886     for (i=0; i<width; i++) {
1887         int r= src1[3*i + 0];
1888         int g= src1[3*i + 1];
1889         int b= src1[3*i + 2];
1890
1891         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1892         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1893     }
1894 }
1895
1896 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1897                                     const uint8_t *src2, int width, uint32_t *unused)
1898 {
1899     int i;
1900     assert(src1==src2);
1901     for (i=0; i<width; i++) {
1902         int r= src1[6*i + 0] + src1[6*i + 3];
1903         int g= src1[6*i + 1] + src1[6*i + 4];
1904         int b= src1[6*i + 2] + src1[6*i + 5];
1905
1906         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1907         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1908     }
1909 }
1910
1911 static void planar_rgb_to_y(uint16_t *dst, const uint8_t *src[4], int width)
1912 {
1913     int i;
1914     for (i = 0; i < width; i++) {
1915         int g = src[0][i];
1916         int b = src[1][i];
1917         int r = src[2][i];
1918
1919         dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1920     }
1921 }
1922
1923 static void planar_rgb16le_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1924 {
1925     int i;
1926     const uint16_t **src = (const uint16_t **) _src;
1927     uint16_t *dst = (uint16_t *) _dst;
1928     for (i = 0; i < width; i++) {
1929         int g = AV_RL16(src[0] + i);
1930         int b = AV_RL16(src[1] + i);
1931         int r = AV_RL16(src[2] + i);
1932
1933         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1934     }
1935 }
1936
1937 static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1938 {
1939     int i;
1940     const uint16_t **src = (const uint16_t **) _src;
1941     uint16_t *dst = (uint16_t *) _dst;
1942     for (i = 0; i < width; i++) {
1943         int g = AV_RB16(src[0] + i);
1944         int b = AV_RB16(src[1] + i);
1945         int r = AV_RB16(src[2] + i);
1946
1947         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1948     }
1949 }
1950
1951 static void planar_rgb_to_uv(uint16_t *dstU, uint16_t *dstV, const uint8_t *src[4], int width)
1952 {
1953     int i;
1954     for (i = 0; i < width; i++) {
1955         int g = src[0][i];
1956         int b = src[1][i];
1957         int r = src[2][i];
1958
1959         dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1960         dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1961     }
1962 }
1963
1964 static void planar_rgb16le_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1965 {
1966     int i;
1967     const uint16_t **src = (const uint16_t **) _src;
1968     uint16_t *dstU = (uint16_t *) _dstU;
1969     uint16_t *dstV = (uint16_t *) _dstV;
1970     for (i = 0; i < width; i++) {
1971         int g = AV_RL16(src[0] + i);
1972         int b = AV_RL16(src[1] + i);
1973         int r = AV_RL16(src[2] + i);
1974
1975         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1976         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1977     }
1978 }
1979
1980 static void planar_rgb16be_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1981 {
1982     int i;
1983     const uint16_t **src = (const uint16_t **) _src;
1984     uint16_t *dstU = (uint16_t *) _dstU;
1985     uint16_t *dstV = (uint16_t *) _dstV;
1986     for (i = 0; i < width; i++) {
1987         int g = AV_RB16(src[0] + i);
1988         int b = AV_RB16(src[1] + i);
1989         int r = AV_RB16(src[2] + i);
1990
1991         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1992         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1993     }
1994 }
1995
1996 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
1997                            const int16_t *filter,
1998                            const int16_t *filterPos, int filterSize)
1999 {
2000     int i;
2001     int32_t *dst = (int32_t *) _dst;
2002     const uint16_t *src = (const uint16_t *) _src;
2003     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2004     int sh = bits - 4;
2005
2006     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2007         sh= 9;
2008
2009     for (i = 0; i < dstW; i++) {
2010         int j;
2011         int srcPos = filterPos[i];
2012         int val = 0;
2013
2014         for (j = 0; j < filterSize; j++) {
2015             val += src[srcPos + j] * filter[filterSize * i + j];
2016         }
2017         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
2018         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
2019     }
2020 }
2021
2022 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
2023                            const int16_t *filter,
2024                            const int16_t *filterPos, int filterSize)
2025 {
2026     int i;
2027     const uint16_t *src = (const uint16_t *) _src;
2028     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2029
2030     if(sh<15)
2031         sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2032
2033     for (i = 0; i < dstW; i++) {
2034         int j;
2035         int srcPos = filterPos[i];
2036         int val = 0;
2037
2038         for (j = 0; j < filterSize; j++) {
2039             val += src[srcPos + j] * filter[filterSize * i + j];
2040         }
2041         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2042         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2043     }
2044 }
2045
2046 // bilinear / bicubic scaling
2047 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2048                           const int16_t *filter, const int16_t *filterPos,
2049                           int filterSize)
2050 {
2051     int i;
2052     for (i=0; i<dstW; i++) {
2053         int j;
2054         int srcPos= filterPos[i];
2055         int val=0;
2056         for (j=0; j<filterSize; j++) {
2057             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2058         }
2059         //filter += hFilterSize;
2060         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2061         //dst[i] = val>>7;
2062     }
2063 }
2064
2065 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2066                           const int16_t *filter, const int16_t *filterPos,
2067                           int filterSize)
2068 {
2069     int i;
2070     int32_t *dst = (int32_t *) _dst;
2071     for (i=0; i<dstW; i++) {
2072         int j;
2073         int srcPos= filterPos[i];
2074         int val=0;
2075         for (j=0; j<filterSize; j++) {
2076             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2077         }
2078         //filter += hFilterSize;
2079         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2080         //dst[i] = val>>7;
2081     }
2082 }
2083
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place chroma range conversion on 16-bit intermediates: each sample is
 * clamped to at most 30775, then mapped through (x*4663 - 9289992) >> 12.
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int n = 0;

    while (n < width) {
        dstU[n] = (FFMIN(dstU[n], 30775) * 4663 - 9289992) >> 12; //-264
        dstV[n] = (FFMIN(dstV[n], 30775) * 4663 - 9289992) >> 12; //-264
        n++;
    }
}
/**
 * In-place chroma range conversion on 16-bit intermediates: each sample is
 * mapped through (x*1799 + 4081085) >> 11.
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int n = 0;

    while (n < width) {
        dstU[n] = (dstU[n] * 1799 + 4081085) >> 11; //1469
        dstV[n] = (dstV[n] * 1799 + 4081085) >> 11; //1469
        n++;
    }
}
/**
 * In-place luma range conversion on 16-bit intermediates: each sample is
 * clamped to at most 30189, then mapped through (x*19077 - 39057361) >> 14.
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int n = 0;

    while (n < width) {
        dst[n] = (FFMIN(dst[n], 30189) * 19077 - 39057361) >> 14;
        n++;
    }
}
/**
 * In-place luma range conversion on 16-bit intermediates: each sample is
 * mapped through (x*14071 + 33561947) >> 14.
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int n = 0;

    while (n < width) {
        dst[n] = (dst[n] * 14071 + 33561947) >> 14;
        n++;
    }
}
2114
/**
 * In-place chroma range conversion on 32-bit intermediates (the int16_t
 * pointers are reinterpreted as int32_t): each sample is clamped to at most
 * 30775 << 4, then mapped through (x*4663 - (9289992 << 4)) >> 12.
 *
 * The product x*4663 exceeds INT_MAX for large but valid samples (e.g.
 * 240 << 11), which is signed overflow. The multiply and subtract are
 * therefore done in unsigned arithmetic; the difference always fits in an
 * int again for in-range input, so the result is unchanged where the old
 * code happened to work.
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    const int      in_max = 30775 << 4;
    const unsigned offset = 9289992 << 4;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    int i;

    for (i = 0; i < width; i++) {
        int u, v;

        u = dstU[i] < in_max ? dstU[i] : in_max;
        dstU[i] = (int)(u * 4663U - offset) >> 12; //-264

        v = dstV[i] < in_max ? dstV[i] : in_max;
        dstV[i] = (int)(v * 4663U - offset) >> 12; //-264
    }
}
/**
 * In-place chroma range conversion on 32-bit intermediates (the int16_t
 * pointers are reinterpreted as int32_t): each sample is mapped through
 * (x*1799 + (4081085 << 4)) >> 11.
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *planeU = (int32_t *) _dstU;
    int32_t *planeV = (int32_t *) _dstV;
    const int offset = 4081085 << 4;
    int n = 0;

    while (n < width) {
        planeU[n] = (planeU[n] * 1799 + offset) >> 11; //1469
        planeV[n] = (planeV[n] * 1799 + offset) >> 11; //1469
        n++;
    }
}
/**
 * In-place luma range conversion on 32-bit intermediates (the int16_t
 * pointer is reinterpreted as int32_t): each sample is clamped to at most
 * 30189 << 4, then mapped through (x*4769 - (39057361 << 2)) >> 12.
 *
 * The product x*4769 exceeds INT_MAX for large but valid samples (e.g.
 * 235 << 11), which is signed overflow. The multiply and subtract are
 * therefore done in unsigned arithmetic; the difference always fits in an
 * int again for in-range input, so the result is unchanged where the old
 * code happened to work.
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    const int      in_max = 30189 << 4;
    const unsigned offset = 39057361 << 2;
    int32_t *dst = (int32_t *) _dst;
    int i;

    for (i = 0; i < width; i++) {
        const int y = dst[i] < in_max ? dst[i] : in_max;
        dst[i] = (int)(y * 4769U - offset) >> 12;
    }
}
/**
 * In-place luma range conversion on 32-bit intermediates (the int16_t
 * pointer is reinterpreted as int32_t): each sample is mapped through
 * (x*(14071/4) + (33561947 << 4)/4) >> 12.
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int32_t *plane = (int32_t *) _dst;
    const int gain   = 14071 / 4;
    const int offset = (33561947 << 4) / 4;
    int n = 0;

    while (n < width) {
        plane[n] = (plane[n] * gain + offset) >> 12;
        n++;
    }
}
2149
2150 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2151                            const uint8_t *src, int srcW, int xInc)
2152 {
2153     int i;
2154     unsigned int xpos=0;
2155     for (i=0;i<dstWidth;i++) {
2156         register unsigned int xx=xpos>>16;
2157         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2158         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2159         xpos+=xInc;
2160     }
2161     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2162         dst[i] = src[srcW-1]*128;
2163 }
2164
2165 // *** horizontal scale Y line to temp buffer
2166 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2167                                      const uint8_t *src_in[4], int srcW, int xInc,
2168                                      const int16_t *hLumFilter,
2169                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2170                                      uint8_t *formatConvBuffer,
2171                                      uint32_t *pal, int isAlpha)
2172 {
2173     void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2174     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2175     const uint8_t *src = src_in[isAlpha ? 3 : 0];
2176
2177     if (toYV12) {
2178         toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
2179         src= formatConvBuffer;
2180     } else if (c->readLumPlanar && !isAlpha) {
2181         c->readLumPlanar(formatConvBuffer, src_in, srcW);
2182         src = formatConvBuffer;
2183     }
2184
2185     if (!c->hyscale_fast) {
2186         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2187     } else { // fast bilinear upscale / crap downscale
2188         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2189     }
2190
2191     if (convertRange)
2192         convertRange(dst, dstWidth);
2193 }
2194
2195 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2196                            int dstWidth, const uint8_t *src1,
2197                            const uint8_t *src2, int srcW, int xInc)
2198 {
2199     int i;
2200     unsigned int xpos=0;
2201     for (i=0;i<dstWidth;i++) {
2202         register unsigned int xx=xpos>>16;
2203         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2204         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2205         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2206         xpos+=xInc;
2207     }
2208     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2209         dst1[i] = src1[srcW-1]*128;
2210         dst2[i] = src2[srcW-1]*128;
2211     }
2212 }
2213
2214 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2215                                      const uint8_t *src_in[4],
2216                                      int srcW, int xInc, const int16_t *hChrFilter,
2217                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2218                                      uint8_t *formatConvBuffer, uint32_t *pal)
2219 {
2220     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
2221     if (c->chrToYV12) {
2222         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2223         c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
2224         src1= formatConvBuffer;
2225         src2= buf2;
2226     } else if (c->readChrPlanar) {
2227         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2228         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
2229         src1= formatConvBuffer;
2230         src2= buf2;
2231     }
2232
2233     if (!c->hcscale_fast) {
2234         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2235         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2236     } else { // fast bilinear upscale / crap downscale
2237         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2238     }
2239
2240     if (c->chrConvertRange)
2241         c->chrConvertRange(dst1, dst2, dstWidth);
2242 }
2243
/**
 * Select the C output functions matching c->dstFormat.
 *
 * *yuv2plane1 and *yuv2planeX are always assigned, chosen by the destination
 * bit depth (16, 9, 10, or everything else) and endianness. *yuv2nv12cX is
 * assigned only for PIX_FMT_NV12 / PIX_FMT_NV21. The packed writers are
 * assigned only for the formats listed in the switches below; for any other
 * destination format the pointers keep the value the caller passed in.
 *
 * Only c->dstFormat, c->flags and c->alpPixBuf are read from the context.
 *
 * @param c           scaler context
 * @param yuv2plane1  in/out: planar single-line writer
 * @param yuv2planeX  in/out: planar multi-tap writer
 * @param yuv2nv12cX  in/out: interleaved chroma writer (NV12/NV21 only)
 * @param yuv2packed1 in/out: packed writer, "_1" variant
 * @param yuv2packed2 in/out: packed writer, "_2" variant
 * @param yuv2packedX in/out: packed writer, "_X" variant
 */
2244 static av_always_inline void
2245 find_c_packed_planar_out_funcs(SwsContext *c,
2246                                yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX,
2247                                yuv2interleavedX_fn *yuv2nv12cX,
2248                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2249                                yuv2packedX_fn *yuv2packedX)
2250 {
2251     enum PixelFormat dstFormat = c->dstFormat;
2252
         /* Planar writers: pick by destination depth and endianness. */
2253     if (is16BPS(dstFormat)) {
2254         *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c  : yuv2planeX_16LE_c;
2255         *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c  : yuv2plane1_16LE_c;
2256     } else if (is9_OR_10BPS(dstFormat)) {
             /* depth_minus1 == 8 selects the 9-bit writers, anything else the 10-bit ones */
2257         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2258             *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c  : yuv2planeX_9LE_c;
2259             *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c  : yuv2plane1_9LE_c;
2260         } else {
2261             *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c  : yuv2planeX_10LE_c;
2262             *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c  : yuv2plane1_10LE_c;
2263         }
2264     } else {
2265         *yuv2plane1 = yuv2plane1_8_c;
2266         *yuv2planeX = yuv2planeX_8_c;
2267         if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
2268             *yuv2nv12cX = yuv2nv12cX_c;
2269     }
2270
         /*
          * Packed writers. With SWS_FULL_CHR_H_INT set only *yuv2packedX is
          * assigned here (*yuv2packed1 / *yuv2packed2 are not touched in this
          * branch), and only for the 32-bit RGB orderings plus RGB24/BGR24.
          *
          * Pattern used for each 32-bit case: with CONFIG_SMALL one writer is
          * used unconditionally; otherwise the "a" variant (rgba/argb/bgra/abgr)
          * is chosen when CONFIG_SWSCALE_ALPHA is enabled and c->alpPixBuf is
          * set, and the "x" variant in all other cases. The `else` before each
          * "#endif CONFIG_SWSCALE_ALPHA" binds to the brace block after it; when
          * that option is compiled out the brace block runs unconditionally.
          */
2271     if(c->flags & SWS_FULL_CHR_H_INT) {
2272         switch (dstFormat) {
2273             case PIX_FMT_RGBA:
2274 #if CONFIG_SMALL
2275                 *yuv2packedX = yuv2rgba32_full_X_c;
2276 #else
2277 #if CONFIG_SWSCALE_ALPHA
2278                 if (c->alpPixBuf) {
2279                     *yuv2packedX = yuv2rgba32_full_X_c;
2280                 } else
2281 #endif /* CONFIG_SWSCALE_ALPHA */
2282                 {
2283                     *yuv2packedX = yuv2rgbx32_full_X_c;
2284                 }
2285 #endif /* !CONFIG_SMALL */
2286                 break;
2287             case PIX_FMT_ARGB:
2288 #if CONFIG_SMALL
2289                 *yuv2packedX = yuv2argb32_full_X_c;
2290 #else
2291 #if CONFIG_SWSCALE_ALPHA
2292                 if (c->alpPixBuf) {
2293                     *yuv2packedX = yuv2argb32_full_X_c;
2294                 } else
2295 #endif /* CONFIG_SWSCALE_ALPHA */
2296                 {
2297                     *yuv2packedX = yuv2xrgb32_full_X_c;
2298                 }
2299 #endif /* !CONFIG_SMALL */
2300                 break;
2301             case PIX_FMT_BGRA:
2302 #if CONFIG_SMALL
2303                 *yuv2packedX = yuv2bgra32_full_X_c;
2304 #else
2305 #if CONFIG_SWSCALE_ALPHA
2306                 if (c->alpPixBuf) {
2307                     *yuv2packedX = yuv2bgra32_full_X_c;
2308                 } else
2309 #endif /* CONFIG_SWSCALE_ALPHA */
2310                 {
2311                     *yuv2packedX = yuv2bgrx32_full_X_c;
2312                 }
2313 #endif /* !CONFIG_SMALL */
2314                 break;
2315             case PIX_FMT_ABGR:
2316 #if CONFIG_SMALL
2317                 *yuv2packedX = yuv2abgr32_full_X_c;
2318 #else
2319 #if CONFIG_SWSCALE_ALPHA
2320                 if (c->alpPixBuf) {
2321                     *yuv2packedX = yuv2abgr32_full_X_c;
2322                 } else
2323 #endif /* CONFIG_SWSCALE_ALPHA */
2324                 {
2325                     *yuv2packedX = yuv2xbgr32_full_X_c;
2326                 }
2327 #endif /* !CONFIG_SMALL */
2328                 break;
2329             case PIX_FMT_RGB24:
2330             *yuv2packedX = yuv2rgb24_full_X_c;
2331             break;
2332         case PIX_FMT_BGR24:
2333             *yuv2packedX = yuv2bgr24_full_X_c;
2334             break;
2335         }
             /*
              * Fall back to the regular packed table when no writer is set.
              * The test is on the pointer value, which this function does not
              * clear first, so the fallback is taken only if the caller passed
              * in a NULL *yuv2packedX and the switch above did not match.
              * The goto jumps into the else branch below.
              */
2336         if(!*yuv2packedX)
2337             goto YUV_PACKED;
2338     } else {
2339         YUV_PACKED:
             /* Regular packed RGB writers: all three variants are assigned per format. */
2340         switch (dstFormat) {
2341         case PIX_FMT_RGB48LE:
2342             *yuv2packed1 = yuv2rgb48le_1_c;
2343             *yuv2packed2 = yuv2rgb48le_2_c;
2344             *yuv2packedX = yuv2rgb48le_X_c;
2345             break;
2346         case PIX_FMT_RGB48BE:
2347             *yuv2packed1 = yuv2rgb48be_1_c;
2348             *yuv2packed2 = yuv2rgb48be_2_c;
2349             *yuv2packedX = yuv2rgb48be_X_c;
2350             break;
2351         case PIX_FMT_BGR48LE:
2352             *yuv2packed1 = yuv2bgr48le_1_c;
2353             *yuv2packed2 = yuv2bgr48le_2_c;
2354             *yuv2packedX = yuv2bgr48le_X_c;
2355             break;
2356         case PIX_FMT_BGR48BE:
2357             *yuv2packed1 = yuv2bgr48be_1_c;
2358             *yuv2packed2 = yuv2bgr48be_2_c;
2359             *yuv2packedX = yuv2bgr48be_X_c;
2360             break;
             /* RGB32 and BGR32 share the same writers; same CONFIG_SMALL / alpha pattern as above. */
2361         case PIX_FMT_RGB32:
2362         case PIX_FMT_BGR32:
2363 #if CONFIG_SMALL
2364             *yuv2packed1 = yuv2rgb32_1_c;
2365             *yuv2packed2 = yuv2rgb32_2_c;
2366             *yuv2packedX = yuv2rgb32_X_c;
2367 #else
2368 #if CONFIG_SWSCALE_ALPHA
2369                 if (c->alpPixBuf) {
2370                     *yuv2packed1 = yuv2rgba32_1_c;
2371                     *yuv2packed2 = yuv2rgba32_2_c;
2372                     *yuv2packedX = yuv2rgba32_X_c;
2373                 } else
2374 #endif /* CONFIG_SWSCALE_ALPHA */
2375                 {
2376                     *yuv2packed1 = yuv2rgbx32_1_c;
2377                     *yuv2packed2 = yuv2rgbx32_2_c;
2378                     *yuv2packedX = yuv2rgbx32_X_c;
2379                 }
2380 #endif /* !CONFIG_SMALL */
2381             break;
2382         case PIX_FMT_RGB32_1:
2383         case PIX_FMT_BGR32_1:
2384 #if CONFIG_SMALL
2385                 *yuv2packed1 = yuv2rgb32_1_1_c;
2386                 *yuv2packed2 = yuv2rgb32_1_2_c;
2387                 *yuv2packedX = yuv2rgb32_1_X_c;
2388 #else
2389 #if CONFIG_SWSCALE_ALPHA
2390                 if (c->alpPixBuf) {
2391                     *yuv2packed1 = yuv2rgba32_1_1_c;
2392                     *yuv2packed2 = yuv2rgba32_1_2_c;
2393                     *yuv2packedX = yuv2rgba32_1_X_c;
2394                 } else
2395 #endif /* CONFIG_SWSCALE_ALPHA */
2396                 {
2397                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2398                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2399                     *yuv2packedX = yuv2rgbx32_1_X_c;
2400                 }
2401 #endif /* !CONFIG_SMALL */
2402                 break;
2403         case PIX_FMT_RGB24:
2404             *yuv2packed1 = yuv2rgb24_1_c;
2405             *yuv2packed2 = yuv2rgb24_2_c;
2406             *yuv2packedX = yuv2rgb24_X_c;
2407             break;
2408         case PIX_FMT_BGR24:
2409             *yuv2packed1 = yuv2bgr24_1_c;
2410             *yuv2packed2 = yuv2bgr24_2_c;
2411             *yuv2packedX = yuv2bgr24_X_c;
2412             break;
             /* The 16/15/12/8/4-bit groups below use one writer set for both RGB and BGR orderings (and both endiannesses). */
2413         case PIX_FMT_RGB565LE:
2414         case PIX_FMT_RGB565BE:
2415         case PIX_FMT_BGR565LE:
2416         case PIX_FMT_BGR565BE:
2417             *yuv2packed1 = yuv2rgb16_1_c;
2418             *yuv2packed2 = yuv2rgb16_2_c;
2419             *yuv2packedX = yuv2rgb16_X_c;
2420             break;
2421         case PIX_FMT_RGB555LE:
2422         case PIX_FMT_RGB555BE:
2423         case PIX_FMT_BGR555LE:
2424         case PIX_FMT_BGR555BE:
2425             *yuv2packed1 = yuv2rgb15_1_c;
2426             *yuv2packed2 = yuv2rgb15_2_c;
2427             *yuv2packedX = yuv2rgb15_X_c;
2428             break;
2429         case PIX_FMT_RGB444LE:
2430         case PIX_FMT_RGB444BE:
2431         case PIX_FMT_BGR444LE:
2432         case PIX_FMT_BGR444BE:
2433             *yuv2packed1 = yuv2rgb12_1_c;
2434             *yuv2packed2 = yuv2rgb12_2_c;
2435             *yuv2packedX = yuv2rgb12_X_c;
2436             break;
2437         case PIX_FMT_RGB8:
2438         case PIX_FMT_BGR8:
2439             *yuv2packed1 = yuv2rgb8_1_c;
2440             *yuv2packed2 = yuv2rgb8_2_c;
2441             *yuv2packedX = yuv2rgb8_X_c;
2442             break;
2443         case PIX_FMT_RGB4:
2444         case PIX_FMT_BGR4:
2445             *yuv2packed1 = yuv2rgb4_1_c;
2446             *yuv2packed2 = yuv2rgb4_2_c;
2447             *yuv2packedX = yuv2rgb4_X_c;
2448             break;
2449         case PIX_FMT_RGB4_BYTE:
2450         case PIX_FMT_BGR4_BYTE:
2451             *yuv2packed1 = yuv2rgb4b_1_c;
2452             *yuv2packed2 = yuv2rgb4b_2_c;
2453             *yuv2packedX = yuv2rgb4b_X_c;
2454             break;
2455         }
2456     }
         /*
          * Gray16, monochrome and packed YUV 4:2:2 destinations: handled outside
          * the if/else above, so they are assigned whether or not
          * SWS_FULL_CHR_H_INT is set.
          */
2457     switch (dstFormat) {
2458     case PIX_FMT_GRAY16BE:
2459         *yuv2packed1 = yuv2gray16BE_1_c;
2460         *yuv2packed2 = yuv2gray16BE_2_c;
2461         *yuv2packedX = yuv2gray16BE_X_c;
2462         break;
2463     case PIX_FMT_GRAY16LE:
2464         *yuv2packed1 = yuv2gray16LE_1_c;
2465         *yuv2packed2 = yuv2gray16LE_2_c;
2466         *yuv2packedX = yuv2gray16LE_X_c;
2467         break;
2468     case PIX_FMT_MONOWHITE:
2469         *yuv2packed1 = yuv2monowhite_1_c;
2470         *yuv2packed2 = yuv2monowhite_2_c;
2471         *yuv2packedX = yuv2monowhite_X_c;
2472         break;
2473     case PIX_FMT_MONOBLACK:
2474         *yuv2packed1 = yuv2monoblack_1_c;
2475         *yuv2packed2 = yuv2monoblack_2_c;
2476         *yuv2packedX = yuv2monoblack_X_c;
2477         break;
2478     case PIX_FMT_YUYV422:
2479         *yuv2packed1 = yuv2yuyv422_1_c;
2480         *yuv2packed2 = yuv2yuyv422_2_c;
2481         *yuv2packedX = yuv2yuyv422_X_c;
2482         break;
2483     case PIX_FMT_UYVY422:
2484         *yuv2packed1 = yuv2uyvy422_1_c;
2485         *yuv2packed2 = yuv2uyvy422_2_c;
2486         *yuv2packedX = yuv2uyvy422_X_c;
2487         break;
2488     }
2489 }
2490
/* Set to a non-zero value to make the DEBUG_BUFFERS() trace calls emit log output. */
2491 #define DEBUG_SWSCALE_BUFFERS 0
/*
 * Trace helper: forwards its arguments to av_log() at AV_LOG_DEBUG level when
 * DEBUG_SWSCALE_BUFFERS is non-zero. It expects a variable named `c` to be in
 * scope at the call site, which is passed as the av_log() context.
 *
 * The body is wrapped in do { } while (0) so that a use followed by `;`
 * forms exactly one statement. A bare `if` here would capture an `else`
 * written after the macro in an unbraced if/else at the call site.
 */
#define DEBUG_BUFFERS(...) \
    do { \
        if (DEBUG_SWSCALE_BUFFERS) \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__); \
    } while (0)
2493
/**
 * Scale one horizontal slice of the source picture into the destination.
 *
 * Each source line is first scaled horizontally into the line ring buffers
 * (lumPixBuf, chrUPixBuf/chrVPixBuf and, when present, alpPixBuf). A
 * destination line is then written by the vertical/output functions as soon
 * as every source line it needs has been buffered. If the slice ends before
 * a destination line can be produced, the remaining lines of the slice are
 * buffered and the loop stops. dstY, the ring-buffer indices and the last
 * buffered line numbers are stored back into the context so that the next
 * slice continues from there.
 *
 * @param c         scaler context
 * @param src       source plane pointers; for packed source formats all four
 *                  entries are overwritten with src[0]
 * @param srcStride source strides; modified in place (for packed formats all
 *                  four are set to srcStride[0]; entries 1 and 2 are then
 *                  shifted left by c->vChrDrop)
 * @param srcSliceY first source line of this slice; 0 resets the per-picture
 *                  state (dstY, buffer indices, last buffered lines)
 * @param srcSliceH number of source lines in this slice
 * @param dst       destination plane pointers
 * @param dstStride destination strides
 * @return number of destination lines written by this call
 */
2494 static int swScale(SwsContext *c, const uint8_t* src[],
2495                    int srcStride[], int srcSliceY,
2496                    int srcSliceH, uint8_t* dst[], int dstStride[])
2497 {
2498     /* load a few things into local vars to make the code more readable? and faster */
2499     const int srcW= c->srcW;
2500     const int dstW= c->dstW;
2501     const int dstH= c->dstH;
2502     const int chrDstW= c->chrDstW;
2503     const int chrSrcW= c->chrSrcW;
2504     const int lumXInc= c->lumXInc;
2505     const int chrXInc= c->chrXInc;
2506     const enum PixelFormat dstFormat= c->dstFormat;
2507     const int flags= c->flags;
2508     int16_t *vLumFilterPos= c->vLumFilterPos;
2509     int16_t *vChrFilterPos= c->vChrFilterPos;
2510     int16_t *hLumFilterPos= c->hLumFilterPos;
2511     int16_t *hChrFilterPos= c->hChrFilterPos;
2512     int16_t *hLumFilter= c->hLumFilter;
2513     int16_t *hChrFilter= c->hChrFilter;
2514     int32_t *lumMmxFilter= c->lumMmxFilter;
2515     int32_t *chrMmxFilter= c->chrMmxFilter;
2516     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2517     const int vLumFilterSize= c->vLumFilterSize;
2518     const int vChrFilterSize= c->vChrFilterSize;
2519     const int hLumFilterSize= c->hLumFilterSize;
2520     const int hChrFilterSize= c->hChrFilterSize;
2521     int16_t **lumPixBuf= c->lumPixBuf;
2522     int16_t **chrUPixBuf= c->chrUPixBuf;
2523     int16_t **chrVPixBuf= c->chrVPixBuf;
2524     int16_t **alpPixBuf= c->alpPixBuf;
2525     const int vLumBufSize= c->vLumBufSize;
2526     const int vChrBufSize= c->vChrBufSize;
2527     uint8_t *formatConvBuffer= c->formatConvBuffer;
         /* slice position/height in chroma lines; the height is rounded up via the negate-shift-negate idiom */
2528     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2529     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2530     int lastDstY;
2531     uint32_t *pal=c->pal_yuv;
2532     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2533
         /* local copies of the output functions; they may be replaced by the C versions near the bottom of the picture (see the dstY >= dstH-2 check) */
2534     yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
2535     yuv2planarX_fn yuv2planeX = c->yuv2planeX;
2536     yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
2537     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2538     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2539     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2540
2541     /* vars which will change and which we need to store back in the context */
2542     int dstY= c->dstY;
2543     int lumBufIndex= c->lumBufIndex;
2544     int chrBufIndex= c->chrBufIndex;
2545     int lastInLumBuf= c->lastInLumBuf;
2546     int lastInChrBuf= c->lastInChrBuf;
2547
         /* packed source: every "plane" aliases plane 0 (this writes into the caller's arrays) */
2548     if (isPacked(c->srcFormat)) {
2549         src[0]=
2550         src[1]=
2551         src[2]=
2552         src[3]= src[0];
2553         srcStride[0]=
2554         srcStride[1]=
2555         srcStride[2]=
2556         srcStride[3]= srcStride[0];
2557     }
         /* scale the chroma strides by 2^vChrDrop (also modifies the caller's array) */
2558     srcStride[1]<<= c->vChrDrop;
2559     srcStride[2]<<= c->vChrDrop;
2560
2561     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2562                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2563                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2564     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2565                    srcSliceY,    srcSliceH,    dstY,    dstH);
2566     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2567                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2568
         /* one-time warning (process-wide static flag) about destination strides that are not multiples of 16; only with SWS_PRINT_INFO */
2569     if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 || dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
2570         static int warnedAlready=0; //FIXME move this into the context perhaps
2571         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2572             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2573                    "         ->cannot do aligned memory accesses anymore\n");
2574             warnedAlready=1;
2575         }
2576     }
2577
         /*
          * One-time warning about pointers or strides that are not 16-byte
          * aligned, printed only when HAVE_MMX2 is set and the CPU reports SSE2.
          * NOTE(review): the (int) casts convert pointers to int, which
          * truncates where pointers are wider than int (and draws a compiler
          * warning); uintptr_t would be the portable type for this test.
          */
2578     if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
2579         || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
2580         || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
2581     ) {
2582         static int warnedAlready=0;
2583         int cpu_flags = av_get_cpu_flags();
2584         if (HAVE_MMX2 && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
2585             av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
2586             warnedAlready=1;
2587         }
2588     }
2589
2590     /* Note the user might start scaling the picture in the middle so this
2591        will not get executed. This is not really intended but works
2592        currently, so people might do it. */
2593     if (srcSliceY ==0) {
2594         lumBufIndex=-1;
2595         chrBufIndex=-1;
2596         dstY=0;
2597         lastInLumBuf= -1;
2598         lastInChrBuf= -1;
2599     }
2600
         /* without dithering both dither pointers use the constant ff_sws_pb_64 table; with dithering they are set per line inside the loop */
2601     if (!should_dither) {
2602         c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
2603     }
         /* remember where this call started; the return value is dstY - lastDstY */
2604     lastDstY= dstY;
2605
         /* one iteration per destination line; leaves early when the slice runs out of input */
2606     for (;dstY < dstH; dstY++) {
2607         const int chrDstY= dstY>>c->chrDstVSubSample;
2608         uint8_t *dest[4] = {
2609             dst[0] + dstStride[0] * dstY,
2610             dst[1] + dstStride[1] * chrDstY,
2611             dst[2] + dstStride[2] * chrDstY,
2612             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2613         };
2614         int use_mmx_vfilter= c->use_mmx_vfilter;
2615
             /* firstLumSrcY2 is the filter position of the last luma line that maps to the same chroma line as dstY, clamped to dstH-1 */
2616         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2617         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2618         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2619         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2620         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2621         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2622         int enough_lines;
2623
2624         //handle holes (FAST_BILINEAR & weird filters)
2625         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2626         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
             /* the first needed line must still be inside the buffered window */
2627         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2628         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2629
2630         DEBUG_BUFFERS("dstY: %d\n", dstY);
2631         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2632                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2633         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2634                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2635
2636         // Do we have enough lines in this slice to output the dstY line
             // (the chroma bound is the slice end in chroma lines, rounded up)
2637         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2638
             /* not enough input: buffer everything that is left in this slice, then stop (see the break below) */
2639         if (!enough_lines) {
2640             lastLumSrcY = srcSliceY + srcSliceH - 1;
2641             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2642             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2643                                             lastLumSrcY, lastChrSrcY);
2644         }
2645
2646         //Do horizontal scaling
             /* luma (and alpha): one buffer entry per source line up to lastLumSrcY; row offsets are relative to the slice start */
2647         while(lastInLumBuf < lastLumSrcY) {
2648             const uint8_t *src1[4] = {
2649                 src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
2650                 src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
2651                 src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
2652                 src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
2653             };
2654             lumBufIndex++;
2655             assert(lumBufIndex < 2*vLumBufSize);
2656             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2657             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2658             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2659                     hLumFilter, hLumFilterPos, hLumFilterSize,
2660                     formatConvBuffer,
2661                     pal, 0);
                 /* same call with the final argument set to 1, writing into alpPixBuf (presumably selects the alpha plane; confirm in hyscale) */
2662             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2663                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src1, srcW,
2664                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2665                         formatConvBuffer,
2666                         pal, 1);
2667             lastInLumBuf++;
2668             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2669                                lumBufIndex,    lastInLumBuf);
2670         }
             /* chroma: same scheme in chroma line units; the buffer index advances even when c->needs_hcscale is 0 and no scaling is done */
2671         while(lastInChrBuf < lastChrSrcY) {
2672             const uint8_t *src1[4] = {
2673                 src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
2674                 src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
2675                 src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
2676                 src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
2677             };
2678             chrBufIndex++;
2679             assert(chrBufIndex < 2*vChrBufSize);
2680             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2681             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2682             //FIXME replace parameters through context struct (some at least)
2683
2684             if (c->needs_hcscale)
2685                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2686                           chrDstW, src1, chrSrcW, chrXInc,
2687                           hChrFilter, hChrFilterPos, hChrFilterSize,
2688                           formatConvBuffer, pal);
2689             lastInChrBuf++;
2690             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2691                                chrBufIndex,    lastInChrBuf);
2692         }
2693         //wrap buf index around to stay inside the ring buffer
2694         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2695         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2696         if (!enough_lines)
2697             break; //we can't output a dstY line so let's try with the next slice
2698
2699 #if HAVE_MMX
2700         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2701 #endif
             /* pick the dither table row for this line (row index = line number & 7) */
2702         if (should_dither) {
2703             c->chrDither8 = dither_8x8_128[chrDstY & 7];
2704             c->lumDither8 = dither_8x8_128[dstY & 7];
2705         }
             /* last two destination lines: switch the local function pointers to the C writers and disable the MMX vertical filter */
2706         if (dstY >= dstH-2) {
2707             // hmm looks like we can't use MMX here without overwriting this array's tail
2708             find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
2709                                            &yuv2packed1, &yuv2packed2, &yuv2packedX);
2710             use_mmx_vfilter= 0;
2711         }
2712
2713         {
                 /*
                  * Pointers to the buffer entry holding the first source line
                  * needed for this output line. The "+ v*BufSize" offset places
                  * the window in the upper half of the pointer array; presumably
                  * the arrays hold 2*v*BufSize entries with the second half
                  * mirroring the first, so the filter window can be read without
                  * wrapping. Confirm at the allocation site.
                  */
2714             const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2715             const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2716             const int16_t **chrVSrcPtr= (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2717             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2718             int16_t *vLumFilter= c->vLumFilter;
2719             int16_t *vChrFilter= c->vChrFilter;
2720
2721             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                     /* chroma is written only on lines where the low chrDstVSubSample bits of dstY are zero */
2722                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2723
                     /* advance to the coefficients for this output line */
2724                 vLumFilter +=    dstY * vLumFilterSize;
2725                 vChrFilter += chrDstY * vChrFilterSize;
2726
                     /* on x86, the MMX vertical filter must be in use exactly when yuv2planeX is not one of the C implementations */
2727                 av_assert0(use_mmx_vfilter != (
2728                                yuv2planeX == yuv2planeX_10BE_c
2729                             || yuv2planeX == yuv2planeX_10LE_c
2730                             || yuv2planeX == yuv2planeX_9BE_c
2731                             || yuv2planeX == yuv2planeX_9LE_c
2732                             || yuv2planeX == yuv2planeX_16BE_c
2733                             || yuv2planeX == yuv2planeX_16LE_c
2734                             || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
2735
                     /* the MMX path takes its coefficients from the context's MMX filter tables instead */
2736                 if(use_mmx_vfilter){
2737                     vLumFilter= c->lumMmxFilter;
2738                     vChrFilter= c->chrMmxFilter;
2739                 }
2740
                     /* luma plane: single-tap fast path or general filter */
2741                 if (vLumFilterSize == 1) {
2742                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
2743                 } else {
2744                     yuv2planeX(vLumFilter, vLumFilterSize,
2745                                lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
2746                 }
2747
                     /* chroma planes: skipped on non-chroma lines and for gray destinations */
2748                 if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
2749                     if (yuv2nv12cX) {
2750                         yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
2751                     } else if (vChrFilterSize == 1) {
2752                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
2753                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
2754                     } else {
2755                         yuv2planeX(vChrFilter, vChrFilterSize,
2756                                    chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
2757                         yuv2planeX(vChrFilter, vChrFilterSize,
2758                                    chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
2759                     }
2760                 }
2761
                     /* alpha plane: same filter size as luma, read from the alpha buffers */
2762                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2763                     if(use_mmx_vfilter){
2764                         vLumFilter= c->alpMmxFilter;
2765                     }
2766                     if (vLumFilterSize == 1) {
2767                         yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
2768                     } else {
2769                         yuv2planeX(vLumFilter, vLumFilterSize,
2770                                    alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
2771                     }
2772                 }
2773             } else {
                     /*
                      * Packed output: everything is written to dest[0].
                      * NOTE(review): the dispatch below tests c->yuv2packed1 /
                      * c->yuv2packed2 (the context's pointers) but calls the
                      * local copies, which may have been replaced above.
                      * NOTE(review): vChrFilter is indexed by dstY here, while
                      * chrMmxFilter below uses chrDstY; the two are equal only
                      * when c->chrDstVSubSample is 0 (presumably always true
                      * for packed destinations; confirm).
                      */
2774                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2775                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2776                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                         /* second coefficient of the 2-tap chroma filter for this line */
2777                     int chrAlpha = vChrFilter[2 * dstY + 1];
2778                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2779                                 alpPixBuf ? *alpSrcPtr : NULL,
2780                                 dest[0], dstW, chrAlpha, dstY);
2781                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2782                     int lumAlpha = vLumFilter[2 * dstY + 1];
2783                     int chrAlpha = vChrFilter[2 * dstY + 1];
                         /* store the first tap times 0x10001 in entries 2 and 3 of the MMX filter tables (for a non-negative tap this duplicates it into both 16-bit halves) */
2784                     lumMmxFilter[2] =
2785                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2786                     chrMmxFilter[2] =
2787                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2788                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2789                                 alpPixBuf ? alpSrcPtr : NULL,
2790                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2791                 } else { //general RGB
2792                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2793                                 lumSrcPtr, vLumFilterSize,
2794                                 vChrFilter + dstY * vChrFilterSize,
2795                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2796                                 alpSrcPtr, dest[0], dstW, dstY);
2797                 }
2798             }
2799         }
2800     }
2801
         /* YUVA420P destination without an alpha buffer: call fillPlane() on dst[3] with value 255 for the dstY-lastDstY lines produced by this call, starting at lastDstY (presumably fills them as opaque; see fillPlane) */
2802     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2803         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2804
         /* issue a store fence when the CPU reports MMX2, then emms_c() */
2805 #if HAVE_MMX2
2806     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2807         __asm__ volatile("sfence":::"memory");
2808 #endif
2809     emms_c();
2810
2811     /* store changed local vars back in the context */
2812     c->dstY= dstY;
2813     c->lumBufIndex= lumBufIndex;
2814     c->chrBufIndex= chrBufIndex;
2815     c->lastInLumBuf= lastInLumBuf;
2816     c->lastInChrBuf= lastInChrBuf;
2817
2818     return dstY - lastDstY;
2819 }
2820
2821 static av_cold void sws_init_swScale_c(SwsContext *c)
2822 {
2823     enum PixelFormat srcFormat = c->srcFormat;
2824
2825     find_c_packed_planar_out_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
2826                                    &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,
2827                                    &c->yuv2packedX);
2828
2829     c->chrToYV12 = NULL;
2830     switch(srcFormat) {
2831         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2832         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2833         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2834         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2835         case PIX_FMT_RGB8     :
2836         case PIX_FMT_BGR8     :
2837         case PIX_FMT_PAL8     :
2838         case PIX_FMT_BGR4_BYTE:
2839         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2840         case PIX_FMT_GBRP9LE:
2841         case PIX_FMT_GBRP10LE:
2842         case PIX_FMT_GBRP16LE:  c->readChrPlanar = planar_rgb16le_to_uv; break;
2843         case PIX_FMT_GBRP9BE:
2844         case PIX_FMT_GBRP10BE:
2845         case PIX_FMT_GBRP16BE:  c->readChrPlanar = planar_rgb16be_to_uv; break;
2846         case PIX_FMT_GBRP:      c->readChrPlanar = planar_rgb_to_uv; break;
2847 #if HAVE_BIGENDIAN
2848         case PIX_FMT_YUV444P9LE:
2849         case PIX_FMT_YUV422P9LE:
2850         case PIX_FMT_YUV420P9LE:
2851         case PIX_FMT_YUV422P10LE:
2852         case PIX_FMT_YUV420P10LE:
2853         case PIX_FMT_YUV444P10LE:
2854         case PIX_FMT_YUV420P16LE:
2855         case PIX_FMT_YUV422P16LE:
2856         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2857 #else
2858         case PIX_FMT_YUV444P9BE:
2859         case PIX_FMT_YUV422P9BE:
2860         case PIX_FMT_YUV420P9BE:
2861         case PIX_FMT_YUV444P10BE:
2862         case PIX_FMT_YUV422P10BE:
2863         case PIX_FMT_YUV420P10BE:
2864         case PIX_FMT_YUV420P16BE:
2865         case PIX_FMT_YUV422P16BE:
2866         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2867 #endif
2868     }
2869     if (c->chrSrcHSubSample) {
2870         switch(srcFormat) {
2871         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2872         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2873         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2874         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2875         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2876         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2877         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2878         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2879         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2880         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2881         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2882         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2883         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2884         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2885         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2886         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2887         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2888         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2889         case PIX_FMT_GBR24P  : c->chrToYV12 = gbr24pToUV_half_c;  break;
2890         }
2891     } else {
2892         switch(srcFormat) {
2893         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2894         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2895         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2896         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2897         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2898         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2899         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2900         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2901         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2902         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2903         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2904         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2905         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2906         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2907         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2908         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2909         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2910         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2911         }
2912     }
2913
2914     c->lumToYV12 = NULL;
2915     c->alpToYV12 = NULL;
2916     switch (srcFormat) {
2917     case PIX_FMT_GBRP9LE:
2918     case PIX_FMT_GBRP10LE:
2919     case PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break;
2920     case PIX_FMT_GBRP9BE:
2921     case PIX_FMT_GBRP10BE:
2922     case PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break;
2923     case PIX_FMT_GBRP:     c->readLumPlanar = planar_rgb_to_y; break;
2924 #if HAVE_BIGENDIAN
2925     case PIX_FMT_YUV444P9LE:
2926     case PIX_FMT_YUV422P9LE:
2927     case PIX_FMT_YUV420P9LE:
2928     case PIX_FMT_YUV422P10LE:
2929     case PIX_FMT_YUV420P10LE:
2930     case PIX_FMT_YUV444P10LE:
2931     case PIX_FMT_YUV420P16LE:
2932     case PIX_FMT_YUV422P16LE:
2933     case PIX_FMT_YUV444P16LE:
2934     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2935 #else
2936     case PIX_FMT_YUV444P9BE:
2937     case PIX_FMT_YUV422P9BE:
2938     case PIX_FMT_YUV420P9BE:
2939     case PIX_FMT_YUV444P10BE:
2940     case PIX_FMT_YUV422P10BE:
2941     case PIX_FMT_YUV420P10BE:
2942     case PIX_FMT_YUV420P16BE:
2943     case PIX_FMT_YUV422P16BE:
2944     case PIX_FMT_YUV444P16BE:
2945     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2946 #endif
2947     case PIX_FMT_YUYV422  :
2948     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2949     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2950     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2951     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2952     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2953     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2954     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2955     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2956     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2957     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2958     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2959     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2960     case PIX_FMT_RGB8     :
2961     case PIX_FMT_BGR8     :
2962     case PIX_FMT_PAL8     :
2963     case PIX_FMT_BGR4_BYTE:
2964     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2965     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2966     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2967     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2968     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2969     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2970     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2971     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2972     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2973     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2974     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2975     }
2976     if (c->alpPixBuf) {
2977         switch (srcFormat) {
2978         case PIX_FMT_BGRA:
2979         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2980         case PIX_FMT_ABGR:
2981         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2982         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2983         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
2984         }
2985     }
2986
2987
2988     if (c->srcBpc == 8) {
2989         if (c->dstBpc <= 10) {
2990             c->hyScale = c->hcScale = hScale8To15_c;
2991             if (c->flags & SWS_FAST_BILINEAR) {
2992                 c->hyscale_fast = hyscale_fast_c;
2993                 c->hcscale_fast = hcscale_fast_c;
2994             }
2995         } else {
2996             c->hyScale = c->hcScale = hScale8To19_c;
2997         }
2998     } else {
2999         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
3000     }
3001
3002     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3003         if (c->dstBpc <= 10) {
3004             if (c->srcRange) {
3005                 c->lumConvertRange = lumRangeFromJpeg_c;
3006                 c->chrConvertRange = chrRangeFromJpeg_c;
3007             } else {
3008                 c->lumConvertRange = lumRangeToJpeg_c;
3009                 c->chrConvertRange = chrRangeToJpeg_c;
3010             }
3011         } else {
3012             if (c->srcRange) {
3013                 c->lumConvertRange = lumRangeFromJpeg16_c;
3014                 c->chrConvertRange = chrRangeFromJpeg16_c;
3015             } else {
3016                 c->lumConvertRange = lumRangeToJpeg16_c;
3017                 c->chrConvertRange = chrRangeToJpeg16_c;
3018             }
3019         }
3020     }
3021
3022     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3023           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3024         c->needs_hcscale = 1;
3025 }
3026
3027 SwsFunc ff_getSwsFunc(SwsContext *c)
3028 {
3029     sws_init_swScale_c(c);
3030
3031     if (HAVE_MMX)
3032         ff_sws_init_swScale_mmx(c);
3033     if (HAVE_ALTIVEC)
3034         ff_sws_init_swScale_altivec(c);
3035
3036     return swScale;
3037 }