/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/pixdesc.h"

#include "swscale_internal.h"
/* BT.601 RGB -> YUV conversion coefficients, fixed-point scaled by
 * 2^RGB2YUV_SHIFT.  Luma (Y) weights are scaled to the 219-step limited
 * range, chroma (U/V) weights to the 224-step limited range. */
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
/*
   Special versions: fast Y 1:1 scaling (no interpolation in y direction)

   TODO:
   more intelligent misalignment avoidance for the horizontal scaler
   write special vertical cubic upscale version
   optimize C code (YV12 / minmax)
   add support for packed pixel YUV input & output
   add support for Y8 output
   optimize BGR24 & BGR32
   add BGR4 output support
   write special BGR->BGR scaler
*/
65 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
66 { 1, 3, 1, 3, 1, 3, 1, 3, },
67 { 2, 0, 2, 0, 2, 0, 2, 0, },
70 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
71 { 6, 2, 6, 2, 6, 2, 6, 2, },
72 { 0, 4, 0, 4, 0, 4, 0, 4, },
75 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
76 { 8, 4, 11, 7, 8, 4, 11, 7, },
77 { 2, 14, 1, 13, 2, 14, 1, 13, },
78 { 10, 6, 9, 5, 10, 6, 9, 5, },
79 { 0, 12, 3, 15, 0, 12, 3, 15, },
82 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
83 { 17, 9, 23, 15, 16, 8, 22, 14, },
84 { 5, 29, 3, 27, 4, 28, 2, 26, },
85 { 21, 13, 19, 11, 20, 12, 18, 10, },
86 { 0, 24, 6, 30, 1, 25, 7, 31, },
87 { 16, 8, 22, 14, 17, 9, 23, 15, },
88 { 4, 28, 2, 26, 5, 29, 3, 27, },
89 { 20, 12, 18, 10, 21, 13, 19, 11, },
90 { 1, 25, 7, 31, 0, 24, 6, 30, },
93 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
94 { 0, 55, 14, 68, 3, 58, 17, 72, },
95 { 37, 18, 50, 32, 40, 22, 54, 35, },
96 { 9, 64, 5, 59, 13, 67, 8, 63, },
97 { 46, 27, 41, 23, 49, 31, 44, 26, },
98 { 2, 57, 16, 71, 1, 56, 15, 70, },
99 { 39, 21, 52, 34, 38, 19, 51, 33, },
100 { 11, 66, 7, 62, 10, 65, 6, 60, },
101 { 48, 30, 43, 25, 47, 29, 42, 24, },
105 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
106 {117, 62, 158, 103, 113, 58, 155, 100, },
107 { 34, 199, 21, 186, 31, 196, 17, 182, },
108 {144, 89, 131, 76, 141, 86, 127, 72, },
109 { 0, 165, 41, 206, 10, 175, 52, 217, },
110 {110, 55, 151, 96, 120, 65, 162, 107, },
111 { 28, 193, 14, 179, 38, 203, 24, 189, },
112 {138, 83, 124, 69, 148, 93, 134, 79, },
113 { 7, 172, 48, 213, 3, 168, 45, 210, },
116 // tries to correct a gamma of 1.5
117 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
118 { 0, 143, 18, 200, 2, 156, 25, 215, },
119 { 78, 28, 125, 64, 89, 36, 138, 74, },
120 { 10, 180, 3, 161, 16, 195, 8, 175, },
121 {109, 51, 93, 38, 121, 60, 105, 47, },
122 { 1, 152, 23, 210, 0, 147, 20, 205, },
123 { 85, 33, 134, 71, 81, 30, 130, 67, },
124 { 14, 190, 6, 171, 12, 185, 5, 166, },
125 {117, 57, 101, 44, 113, 54, 97, 41, },
128 // tries to correct a gamma of 2.0
129 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
130 { 0, 124, 8, 193, 0, 140, 12, 213, },
131 { 55, 14, 104, 42, 66, 19, 119, 52, },
132 { 3, 168, 1, 145, 6, 187, 3, 162, },
133 { 86, 31, 70, 21, 99, 39, 82, 28, },
134 { 0, 134, 11, 206, 0, 129, 9, 200, },
135 { 62, 17, 114, 48, 58, 16, 109, 45, },
136 { 5, 181, 2, 157, 4, 175, 1, 151, },
137 { 95, 36, 78, 26, 90, 34, 74, 24, },
140 // tries to correct a gamma of 2.5
141 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
142 { 0, 107, 3, 187, 0, 125, 6, 212, },
143 { 39, 7, 86, 28, 49, 11, 102, 36, },
144 { 1, 158, 0, 131, 3, 180, 1, 151, },
145 { 68, 19, 52, 12, 81, 25, 64, 17, },
146 { 0, 119, 5, 203, 0, 113, 4, 195, },
147 { 45, 9, 96, 33, 42, 8, 91, 30, },
148 { 2, 172, 1, 144, 2, 165, 0, 137, },
149 { 77, 23, 60, 15, 72, 21, 56, 14, },
152 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
153 { 36, 68, 60, 92, 34, 66, 58, 90,},
154 { 100, 4,124, 28, 98, 2,122, 26,},
155 { 52, 84, 44, 76, 50, 82, 42, 74,},
156 { 116, 20,108, 12,114, 18,106, 10,},
157 { 32, 64, 56, 88, 38, 70, 62, 94,},
158 { 96, 0,120, 24,102, 6,126, 30,},
159 { 48, 80, 40, 72, 54, 86, 46, 78,},
160 { 112, 16,104, 8,118, 22,110, 14,},
162 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
163 { 64, 64, 64, 64, 64, 64, 64, 64 };
165 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
167 { 0, 1, 0, 1, 0, 1, 0, 1,},
168 { 1, 0, 1, 0, 1, 0, 1, 0,},
169 { 0, 1, 0, 1, 0, 1, 0, 1,},
170 { 1, 0, 1, 0, 1, 0, 1, 0,},
171 { 0, 1, 0, 1, 0, 1, 0, 1,},
172 { 1, 0, 1, 0, 1, 0, 1, 0,},
173 { 0, 1, 0, 1, 0, 1, 0, 1,},
174 { 1, 0, 1, 0, 1, 0, 1, 0,},
176 { 1, 2, 1, 2, 1, 2, 1, 2,},
177 { 3, 0, 3, 0, 3, 0, 3, 0,},
178 { 1, 2, 1, 2, 1, 2, 1, 2,},
179 { 3, 0, 3, 0, 3, 0, 3, 0,},
180 { 1, 2, 1, 2, 1, 2, 1, 2,},
181 { 3, 0, 3, 0, 3, 0, 3, 0,},
182 { 1, 2, 1, 2, 1, 2, 1, 2,},
183 { 3, 0, 3, 0, 3, 0, 3, 0,},
185 { 2, 4, 3, 5, 2, 4, 3, 5,},
186 { 6, 0, 7, 1, 6, 0, 7, 1,},
187 { 3, 5, 2, 4, 3, 5, 2, 4,},
188 { 7, 1, 6, 0, 7, 1, 6, 0,},
189 { 2, 4, 3, 5, 2, 4, 3, 5,},
190 { 6, 0, 7, 1, 6, 0, 7, 1,},
191 { 3, 5, 2, 4, 3, 5, 2, 4,},
192 { 7, 1, 6, 0, 7, 1, 6, 0,},
194 { 4, 8, 7, 11, 4, 8, 7, 11,},
195 { 12, 0, 15, 3, 12, 0, 15, 3,},
196 { 6, 10, 5, 9, 6, 10, 5, 9,},
197 { 14, 2, 13, 1, 14, 2, 13, 1,},
198 { 4, 8, 7, 11, 4, 8, 7, 11,},
199 { 12, 0, 15, 3, 12, 0, 15, 3,},
200 { 6, 10, 5, 9, 6, 10, 5, 9,},
201 { 14, 2, 13, 1, 14, 2, 13, 1,},
203 { 9, 17, 15, 23, 8, 16, 14, 22,},
204 { 25, 1, 31, 7, 24, 0, 30, 6,},
205 { 13, 21, 11, 19, 12, 20, 10, 18,},
206 { 29, 5, 27, 3, 28, 4, 26, 2,},
207 { 8, 16, 14, 22, 9, 17, 15, 23,},
208 { 24, 0, 30, 6, 25, 1, 31, 7,},
209 { 12, 20, 10, 18, 13, 21, 11, 19,},
210 { 28, 4, 26, 2, 29, 5, 27, 3,},
212 { 18, 34, 30, 46, 17, 33, 29, 45,},
213 { 50, 2, 62, 14, 49, 1, 61, 13,},
214 { 26, 42, 22, 38, 25, 41, 21, 37,},
215 { 58, 10, 54, 6, 57, 9, 53, 5,},
216 { 16, 32, 28, 44, 19, 35, 31, 47,},
217 { 48, 0, 60, 12, 51, 3, 63, 15,},
218 { 24, 40, 20, 36, 27, 43, 23, 39,},
219 { 56, 8, 52, 4, 59, 11, 55, 7,},
221 { 18, 34, 30, 46, 17, 33, 29, 45,},
222 { 50, 2, 62, 14, 49, 1, 61, 13,},
223 { 26, 42, 22, 38, 25, 41, 21, 37,},
224 { 58, 10, 54, 6, 57, 9, 53, 5,},
225 { 16, 32, 28, 44, 19, 35, 31, 47,},
226 { 48, 0, 60, 12, 51, 3, 63, 15,},
227 { 24, 40, 20, 36, 27, 43, 23, 39,},
228 { 56, 8, 52, 4, 59, 11, 55, 7,},
230 { 36, 68, 60, 92, 34, 66, 58, 90,},
231 { 100, 4,124, 28, 98, 2,122, 26,},
232 { 52, 84, 44, 76, 50, 82, 42, 74,},
233 { 116, 20,108, 12,114, 18,106, 10,},
234 { 32, 64, 56, 88, 38, 70, 62, 94,},
235 { 96, 0,120, 24,102, 6,126, 30,},
236 { 48, 80, 40, 72, 54, 86, 46, 78,},
237 { 112, 16,104, 8,118, 22,110, 14,},
/* Constant vector of 64s — NOTE(review): presumably a flat (identity)
 * filter used elsewhere in this file; verify against its consumers. */
static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
/* Scale factors for dithering, indexed by [depth-1][scale].
 * NOTE(review): the exact semantics of the entries are defined by their
 * users elsewhere in libswscale — confirm before relying on this summary. */
const uint16_t dither_scale[15][16]={
{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
};
260 #define output_pixel(pos, val, bias, signedness) \
262 AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
264 AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
267 static av_always_inline void
268 yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
269 int big_endian, int output_bits)
273 av_assert0(output_bits == 16);
275 for (i = 0; i < dstW; i++) {
276 int val = src[i] + (1 << (shift - 1));
277 output_pixel(&dest[i], val, 0, uint);
281 static av_always_inline void
282 yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
283 const int32_t **src, uint16_t *dest, int dstW,
284 int big_endian, int output_bits)
288 av_assert0(output_bits == 16);
290 for (i = 0; i < dstW; i++) {
291 int val = 1 << (shift - 1);
294 /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
295 * filters (or anything with negative coeffs, the range can be slightly
296 * wider in both directions. To account for this overflow, we subtract
297 * a constant so it always fits in the signed range (assuming a
298 * reasonable filterSize), and re-add that at the end. */
300 for (j = 0; j < filterSize; j++)
301 val += src[j][i] * filter[j];
303 output_pixel(&dest[i], val, 0x8000, int);
309 #define output_pixel(pos, val) \
311 AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
313 AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
316 static av_always_inline void
317 yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
318 int big_endian, int output_bits)
321 int shift = 15 - output_bits;
323 for (i = 0; i < dstW; i++) {
324 int val = src[i] + (1 << (shift - 1));
325 output_pixel(&dest[i], val);
329 static av_always_inline void
330 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
331 const int16_t **src, uint16_t *dest, int dstW,
332 int big_endian, int output_bits)
335 int shift = 11 + 16 - output_bits;
337 for (i = 0; i < dstW; i++) {
338 int val = 1 << (shift - 1);
341 for (j = 0; j < filterSize; j++)
342 val += src[j][i] * filter[j];
344 output_pixel(&dest[i], val);
350 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
351 static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
352 uint8_t *dest, int dstW, \
353 const uint8_t *dither, int offset)\
355 yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
356 (uint16_t *) dest, dstW, is_be, bits); \
358 static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
359 const int16_t **src, uint8_t *dest, int dstW, \
360 const uint8_t *dither, int offset)\
362 yuv2planeX_## template_size ## _c_template(filter, \
363 filterSize, (const typeX_t **) src, \
364 (uint16_t *) dest, dstW, is_be, bits); \
366 yuv2NBPS( 9, BE, 1, 10, int16_t)
367 yuv2NBPS( 9, LE, 0, 10, int16_t)
368 yuv2NBPS(10, BE, 1, 10, int16_t)
369 yuv2NBPS(10, LE, 0, 10, int16_t)
370 yuv2NBPS(16, BE, 1, 16, int32_t)
371 yuv2NBPS(16, LE, 0, 16, int32_t)
373 static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
374 const int16_t **src, uint8_t *dest, int dstW,
375 const uint8_t *dither, int offset)
378 for (i=0; i<dstW; i++) {
379 int val = dither[(i + offset) & 7] << 12;
381 for (j=0; j<filterSize; j++)
382 val += src[j][i] * filter[j];
384 dest[i]= av_clip_uint8(val>>19);
388 static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
389 const uint8_t *dither, int offset)
392 for (i=0; i<dstW; i++) {
393 int val = (src[i] + dither[(i + offset) & 7]) >> 7;
394 dest[i]= av_clip_uint8(val);
398 static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
399 const int16_t **chrUSrc, const int16_t **chrVSrc,
400 uint8_t *dest, int chrDstW)
402 enum PixelFormat dstFormat = c->dstFormat;
403 const uint8_t *chrDither = c->chrDither8;
406 if (dstFormat == PIX_FMT_NV12)
407 for (i=0; i<chrDstW; i++) {
408 int u = chrDither[i & 7] << 12;
409 int v = chrDither[(i + 3) & 7] << 12;
411 for (j=0; j<chrFilterSize; j++) {
412 u += chrUSrc[j][i] * chrFilter[j];
413 v += chrVSrc[j][i] * chrFilter[j];
416 dest[2*i]= av_clip_uint8(u>>19);
417 dest[2*i+1]= av_clip_uint8(v>>19);
420 for (i=0; i<chrDstW; i++) {
421 int u = chrDither[i & 7] << 12;
422 int v = chrDither[(i + 3) & 7] << 12;
424 for (j=0; j<chrFilterSize; j++) {
425 u += chrUSrc[j][i] * chrFilter[j];
426 v += chrVSrc[j][i] * chrFilter[j];
429 dest[2*i]= av_clip_uint8(v>>19);
430 dest[2*i+1]= av_clip_uint8(u>>19);
434 #define output_pixel(pos, val) \
435 if (target == PIX_FMT_GRAY16BE) { \
441 static av_always_inline void
442 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
443 const int32_t **lumSrc, int lumFilterSize,
444 const int16_t *chrFilter, const int32_t **chrUSrc,
445 const int32_t **chrVSrc, int chrFilterSize,
446 const int32_t **alpSrc, uint16_t *dest, int dstW,
447 int y, enum PixelFormat target)
451 for (i = 0; i < (dstW >> 1); i++) {
453 int Y1 = (1 << 14) - 0x40000000;
454 int Y2 = (1 << 14) - 0x40000000;
456 for (j = 0; j < lumFilterSize; j++) {
457 Y1 += lumSrc[j][i * 2] * lumFilter[j];
458 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
462 Y1 = av_clip_int16(Y1);
463 Y2 = av_clip_int16(Y2);
464 output_pixel(&dest[i * 2 + 0], 0x8000 + Y1);
465 output_pixel(&dest[i * 2 + 1], 0x8000 + Y2);
469 static av_always_inline void
470 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
471 const int32_t *ubuf[2], const int32_t *vbuf[2],
472 const int32_t *abuf[2], uint16_t *dest, int dstW,
473 int yalpha, int uvalpha, int y,
474 enum PixelFormat target)
476 int yalpha1 = 4095 - yalpha;
478 const int32_t *buf0 = buf[0], *buf1 = buf[1];
480 for (i = 0; i < (dstW >> 1); i++) {
481 int Y1 = (buf0[i * 2 ] * yalpha1 + buf1[i * 2 ] * yalpha) >> 15;
482 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
484 output_pixel(&dest[i * 2 + 0], Y1);
485 output_pixel(&dest[i * 2 + 1], Y2);
489 static av_always_inline void
490 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
491 const int32_t *ubuf[2], const int32_t *vbuf[2],
492 const int32_t *abuf0, uint16_t *dest, int dstW,
493 int uvalpha, int y, enum PixelFormat target)
497 for (i = 0; i < (dstW >> 1); i++) {
498 int Y1 = (buf0[i * 2 ]+4)>>3;
499 int Y2 = (buf0[i * 2 + 1]+4)>>3;
501 output_pixel(&dest[i * 2 + 0], Y1);
502 output_pixel(&dest[i * 2 + 1], Y2);
508 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
509 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
510 const int16_t **_lumSrc, int lumFilterSize, \
511 const int16_t *chrFilter, const int16_t **_chrUSrc, \
512 const int16_t **_chrVSrc, int chrFilterSize, \
513 const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
516 const int32_t **lumSrc = (const int32_t **) _lumSrc, \
517 **chrUSrc = (const int32_t **) _chrUSrc, \
518 **chrVSrc = (const int32_t **) _chrVSrc, \
519 **alpSrc = (const int32_t **) _alpSrc; \
520 uint16_t *dest = (uint16_t *) _dest; \
521 name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
522 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
523 alpSrc, dest, dstW, y, fmt); \
526 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
527 const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
528 const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
529 int yalpha, int uvalpha, int y) \
531 const int32_t **buf = (const int32_t **) _buf, \
532 **ubuf = (const int32_t **) _ubuf, \
533 **vbuf = (const int32_t **) _vbuf, \
534 **abuf = (const int32_t **) _abuf; \
535 uint16_t *dest = (uint16_t *) _dest; \
536 name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
537 dest, dstW, yalpha, uvalpha, y, fmt); \
540 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
541 const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
542 const int16_t *_abuf0, uint8_t *_dest, int dstW, \
543 int uvalpha, int y) \
545 const int32_t *buf0 = (const int32_t *) _buf0, \
546 **ubuf = (const int32_t **) _ubuf, \
547 **vbuf = (const int32_t **) _vbuf, \
548 *abuf0 = (const int32_t *) _abuf0; \
549 uint16_t *dest = (uint16_t *) _dest; \
550 name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
551 dstW, uvalpha, y, fmt); \
554 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE)
555 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE)
557 #define output_pixel(pos, acc) \
558 if (target == PIX_FMT_MONOBLACK) { \
564 static av_always_inline void
565 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
566 const int16_t **lumSrc, int lumFilterSize,
567 const int16_t *chrFilter, const int16_t **chrUSrc,
568 const int16_t **chrVSrc, int chrFilterSize,
569 const int16_t **alpSrc, uint8_t *dest, int dstW,
570 int y, enum PixelFormat target)
572 const uint8_t * const d128=dither_8x8_220[y&7];
573 uint8_t *g = c->table_gU[128] + c->table_gV[128];
577 for (i = 0; i < dstW - 1; i += 2) {
582 for (j = 0; j < lumFilterSize; j++) {
583 Y1 += lumSrc[j][i] * lumFilter[j];
584 Y2 += lumSrc[j][i+1] * lumFilter[j];
588 if ((Y1 | Y2) & 0x100) {
589 Y1 = av_clip_uint8(Y1);
590 Y2 = av_clip_uint8(Y2);
592 acc += acc + g[Y1 + d128[(i + 0) & 7]];
593 acc += acc + g[Y2 + d128[(i + 1) & 7]];
595 output_pixel(*dest++, acc);
600 static av_always_inline void
601 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
602 const int16_t *ubuf[2], const int16_t *vbuf[2],
603 const int16_t *abuf[2], uint8_t *dest, int dstW,
604 int yalpha, int uvalpha, int y,
605 enum PixelFormat target)
607 const int16_t *buf0 = buf[0], *buf1 = buf[1];
608 const uint8_t * const d128 = dither_8x8_220[y & 7];
609 uint8_t *g = c->table_gU[128] + c->table_gV[128];
610 int yalpha1 = 4095 - yalpha;
613 for (i = 0; i < dstW - 7; i += 8) {
614 int acc = g[((buf0[i ] * yalpha1 + buf1[i ] * yalpha) >> 19) + d128[0]];
615 acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
616 acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
617 acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
618 acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
619 acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
620 acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
621 acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
622 output_pixel(*dest++, acc);
626 static av_always_inline void
627 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
628 const int16_t *ubuf[2], const int16_t *vbuf[2],
629 const int16_t *abuf0, uint8_t *dest, int dstW,
630 int uvalpha, int y, enum PixelFormat target)
632 const uint8_t * const d128 = dither_8x8_220[y & 7];
633 uint8_t *g = c->table_gU[128] + c->table_gV[128];
636 for (i = 0; i < dstW - 7; i += 8) {
637 int acc = g[(buf0[i ] >> 7) + d128[0]];
638 acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
639 acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
640 acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
641 acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
642 acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
643 acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
644 acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
645 output_pixel(*dest++, acc);
651 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
652 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
653 const int16_t **lumSrc, int lumFilterSize, \
654 const int16_t *chrFilter, const int16_t **chrUSrc, \
655 const int16_t **chrVSrc, int chrFilterSize, \
656 const int16_t **alpSrc, uint8_t *dest, int dstW, \
659 name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
660 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
661 alpSrc, dest, dstW, y, fmt); \
664 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
665 const int16_t *ubuf[2], const int16_t *vbuf[2], \
666 const int16_t *abuf[2], uint8_t *dest, int dstW, \
667 int yalpha, int uvalpha, int y) \
669 name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
670 dest, dstW, yalpha, uvalpha, y, fmt); \
673 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
674 const int16_t *ubuf[2], const int16_t *vbuf[2], \
675 const int16_t *abuf0, uint8_t *dest, int dstW, \
676 int uvalpha, int y) \
678 name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
679 abuf0, dest, dstW, uvalpha, \
683 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE)
684 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK)
686 #define output_pixels(pos, Y1, U, Y2, V) \
687 if (target == PIX_FMT_YUYV422) { \
688 dest[pos + 0] = Y1; \
690 dest[pos + 2] = Y2; \
694 dest[pos + 1] = Y1; \
696 dest[pos + 3] = Y2; \
699 static av_always_inline void
700 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
701 const int16_t **lumSrc, int lumFilterSize,
702 const int16_t *chrFilter, const int16_t **chrUSrc,
703 const int16_t **chrVSrc, int chrFilterSize,
704 const int16_t **alpSrc, uint8_t *dest, int dstW,
705 int y, enum PixelFormat target)
709 for (i = 0; i < (dstW >> 1); i++) {
716 for (j = 0; j < lumFilterSize; j++) {
717 Y1 += lumSrc[j][i * 2] * lumFilter[j];
718 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
720 for (j = 0; j < chrFilterSize; j++) {
721 U += chrUSrc[j][i] * chrFilter[j];
722 V += chrVSrc[j][i] * chrFilter[j];
728 if ((Y1 | Y2 | U | V) & 0x100) {
729 Y1 = av_clip_uint8(Y1);
730 Y2 = av_clip_uint8(Y2);
731 U = av_clip_uint8(U);
732 V = av_clip_uint8(V);
734 output_pixels(4*i, Y1, U, Y2, V);
738 static av_always_inline void
739 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
740 const int16_t *ubuf[2], const int16_t *vbuf[2],
741 const int16_t *abuf[2], uint8_t *dest, int dstW,
742 int yalpha, int uvalpha, int y,
743 enum PixelFormat target)
745 const int16_t *buf0 = buf[0], *buf1 = buf[1],
746 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
747 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
748 int yalpha1 = 4095 - yalpha;
749 int uvalpha1 = 4095 - uvalpha;
752 for (i = 0; i < (dstW >> 1); i++) {
753 int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
754 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
755 int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
756 int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
758 output_pixels(i * 4, Y1, U, Y2, V);
762 static av_always_inline void
763 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
764 const int16_t *ubuf[2], const int16_t *vbuf[2],
765 const int16_t *abuf0, uint8_t *dest, int dstW,
766 int uvalpha, int y, enum PixelFormat target)
768 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
769 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
772 if (uvalpha < 2048) {
773 for (i = 0; i < (dstW >> 1); i++) {
774 int Y1 = buf0[i * 2] >> 7;
775 int Y2 = buf0[i * 2 + 1] >> 7;
776 int U = ubuf1[i] >> 7;
777 int V = vbuf1[i] >> 7;
779 output_pixels(i * 4, Y1, U, Y2, V);
782 for (i = 0; i < (dstW >> 1); i++) {
783 int Y1 = buf0[i * 2] >> 7;
784 int Y2 = buf0[i * 2 + 1] >> 7;
785 int U = (ubuf0[i] + ubuf1[i]) >> 8;
786 int V = (vbuf0[i] + vbuf1[i]) >> 8;
788 output_pixels(i * 4, Y1, U, Y2, V);
795 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422)
796 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422)
798 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
799 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
800 #define output_pixel(pos, val) \
801 if (isBE(target)) { \
807 static av_always_inline void
808 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
809 const int32_t **lumSrc, int lumFilterSize,
810 const int16_t *chrFilter, const int32_t **chrUSrc,
811 const int32_t **chrVSrc, int chrFilterSize,
812 const int32_t **alpSrc, uint16_t *dest, int dstW,
813 int y, enum PixelFormat target)
817 for (i = 0; i < (dstW >> 1); i++) {
819 int Y1 = -0x40000000;
820 int Y2 = -0x40000000;
821 int U = -128 << 23; // 19
825 for (j = 0; j < lumFilterSize; j++) {
826 Y1 += lumSrc[j][i * 2] * lumFilter[j];
827 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
829 for (j = 0; j < chrFilterSize; j++) {
830 U += chrUSrc[j][i] * chrFilter[j];
831 V += chrVSrc[j][i] * chrFilter[j];
834 // 8bit: 12+15=27; 16-bit: 12+19=31
842 // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
843 Y1 -= c->yuv2rgb_y_offset;
844 Y2 -= c->yuv2rgb_y_offset;
845 Y1 *= c->yuv2rgb_y_coeff;
846 Y2 *= c->yuv2rgb_y_coeff;
849 // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
851 R = V * c->yuv2rgb_v2r_coeff;
852 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
853 B = U * c->yuv2rgb_u2b_coeff;
855 // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
856 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
857 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
858 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
859 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
860 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
861 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
866 static av_always_inline void
867 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
868 const int32_t *ubuf[2], const int32_t *vbuf[2],
869 const int32_t *abuf[2], uint16_t *dest, int dstW,
870 int yalpha, int uvalpha, int y,
871 enum PixelFormat target)
873 const int32_t *buf0 = buf[0], *buf1 = buf[1],
874 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
875 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
876 int yalpha1 = 4095 - yalpha;
877 int uvalpha1 = 4095 - uvalpha;
880 for (i = 0; i < (dstW >> 1); i++) {
881 int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 14;
882 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 14;
883 int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha + (-128 << 23)) >> 14;
884 int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha + (-128 << 23)) >> 14;
887 Y1 -= c->yuv2rgb_y_offset;
888 Y2 -= c->yuv2rgb_y_offset;
889 Y1 *= c->yuv2rgb_y_coeff;
890 Y2 *= c->yuv2rgb_y_coeff;
894 R = V * c->yuv2rgb_v2r_coeff;
895 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
896 B = U * c->yuv2rgb_u2b_coeff;
898 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
899 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
900 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
901 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
902 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
903 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
908 static av_always_inline void
909 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
910 const int32_t *ubuf[2], const int32_t *vbuf[2],
911 const int32_t *abuf0, uint16_t *dest, int dstW,
912 int uvalpha, int y, enum PixelFormat target)
914 const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
915 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
918 if (uvalpha < 2048) {
919 for (i = 0; i < (dstW >> 1); i++) {
920 int Y1 = (buf0[i * 2] ) >> 2;
921 int Y2 = (buf0[i * 2 + 1]) >> 2;
922 int U = (ubuf0[i] + (-128 << 11)) >> 2;
923 int V = (vbuf0[i] + (-128 << 11)) >> 2;
926 Y1 -= c->yuv2rgb_y_offset;
927 Y2 -= c->yuv2rgb_y_offset;
928 Y1 *= c->yuv2rgb_y_coeff;
929 Y2 *= c->yuv2rgb_y_coeff;
933 R = V * c->yuv2rgb_v2r_coeff;
934 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
935 B = U * c->yuv2rgb_u2b_coeff;
937 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
938 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
939 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
940 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
941 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
942 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
946 for (i = 0; i < (dstW >> 1); i++) {
947 int Y1 = (buf0[i * 2] ) >> 2;
948 int Y2 = (buf0[i * 2 + 1]) >> 2;
949 int U = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
950 int V = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
953 Y1 -= c->yuv2rgb_y_offset;
954 Y2 -= c->yuv2rgb_y_offset;
955 Y1 *= c->yuv2rgb_y_coeff;
956 Y2 *= c->yuv2rgb_y_coeff;
960 R = V * c->yuv2rgb_v2r_coeff;
961 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
962 B = U * c->yuv2rgb_u2b_coeff;
964 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
965 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
966 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
967 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
968 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
969 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
979 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE)
980 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE)
981 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE)
982 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
/**
 * Write out 2 RGB pixels in the target pixel format. This function takes a
 * R/G/B LUT as generated by ff_yuv2rgb_c_init_tables(), which takes care of
 * things like endianness conversion and shifting. The caller takes care of
 * setting the correct offset in these tables from the chroma (U/V) values.
 * This function then uses the luminance (Y1/Y2) values to write out the
 * correct RGB values into the destination buffer.
 */
/*
 * Write two horizontally adjacent output pixels (indices i*2 and i*2+1) in the
 * requested packed-RGB format, using the per-component lookup tables _r/_g/_b
 * that were pre-offset by the chroma values.  Y1/Y2 index into those tables;
 * A1/A2 are the (already clipped) alpha samples, y is the output scanline used
 * for ordered dithering.  Compiled with a compile-time-constant `target` so
 * each wrapper collapses to a single branch-free variant.
 * NOTE(review): this chunk appears subsampled — braces/else lines and some
 * statements are missing between the visible lines; do not edit blind.
 */
992 static av_always_inline void
993 yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
994 unsigned A1, unsigned A2,
995 const void *_r, const void *_g, const void *_b, int y,
996 enum PixelFormat target, int hasAlpha)
/* 32-bit packed formats: each table entry already holds the component shifted
 * into its final byte lane, so one add per component assembles the pixel. */
998 if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
999 target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1000 uint32_t *dest = (uint32_t *) _dest;
1001 const uint32_t *r = (const uint32_t *) _r;
1002 const uint32_t *g = (const uint32_t *) _g;
1003 const uint32_t *b = (const uint32_t *) _b;
/* Alpha goes in byte 0 for the *_1 layouts, byte 3 otherwise. */
1006 int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1008 dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1009 dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
/* presumably an alternate (compile-time hasAlpha) path — TODO confirm against full source */
1012 int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1014 dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1015 dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1017 dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1018 dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
/* 24-bit packed: three separate byte stores per pixel; r_b/b_r swap the
 * red/blue channel order between RGB24 and BGR24. */
1021 } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1022 uint8_t *dest = (uint8_t *) _dest;
1023 const uint8_t *r = (const uint8_t *) _r;
1024 const uint8_t *g = (const uint8_t *) _g;
1025 const uint8_t *b = (const uint8_t *) _b;
1027 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1028 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1030 dest[i * 6 + 0] = r_b[Y1];
1031 dest[i * 6 + 1] = g[Y1];
1032 dest[i * 6 + 2] = b_r[Y1];
1033 dest[i * 6 + 3] = r_b[Y2];
1034 dest[i * 6 + 4] = g[Y2];
1035 dest[i * 6 + 5] = b_r[Y2];
/* 16/15/12-bit packed: ordered dither offsets are added to the table index
 * (the tables are laid out so index+dither performs the rounding). */
1038 } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1039 target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1040 target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1041 uint16_t *dest = (uint16_t *) _dest;
1042 const uint16_t *r = (const uint16_t *) _r;
1043 const uint16_t *g = (const uint16_t *) _g;
1044 const uint16_t *b = (const uint16_t *) _b;
1045 int dr1, dg1, db1, dr2, dg2, db2;
1047 if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1048 dr1 = dither_2x2_8[ y & 1 ][0];
1049 dg1 = dither_2x2_4[ y & 1 ][0];
1050 db1 = dither_2x2_8[(y & 1) ^ 1][0];
1051 dr2 = dither_2x2_8[ y & 1 ][1];
1052 dg2 = dither_2x2_4[ y & 1 ][1];
1053 db2 = dither_2x2_8[(y & 1) ^ 1][1];
1054 } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1055 dr1 = dither_2x2_8[ y & 1 ][0];
1056 dg1 = dither_2x2_8[ y & 1 ][1];
1057 db1 = dither_2x2_8[(y & 1) ^ 1][0];
1058 dr2 = dither_2x2_8[ y & 1 ][1];
1059 dg2 = dither_2x2_8[ y & 1 ][0];
1060 db2 = dither_2x2_8[(y & 1) ^ 1][1];
/* 444: a 4x4 dither matrix is needed for the coarser 4-bit quantization */
1062 dr1 = dither_4x4_16[ y & 3 ][0];
1063 dg1 = dither_4x4_16[ y & 3 ][1];
1064 db1 = dither_4x4_16[(y & 3) ^ 3][0];
1065 dr2 = dither_4x4_16[ y & 3 ][1];
1066 dg2 = dither_4x4_16[ y & 3 ][0];
1067 db2 = dither_4x4_16[(y & 3) ^ 3][1];
1070 dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1071 dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
/* 8/4-bit paletted-style output: dither selected per output column (i*2+k)&7 */
1072 } else /* 8/4-bit */ {
1073 uint8_t *dest = (uint8_t *) _dest;
1074 const uint8_t *r = (const uint8_t *) _r;
1075 const uint8_t *g = (const uint8_t *) _g;
1076 const uint8_t *b = (const uint8_t *) _b;
1077 int dr1, dg1, db1, dr2, dg2, db2;
1079 if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1080 const uint8_t * const d64 = dither_8x8_73[y & 7];
1081 const uint8_t * const d32 = dither_8x8_32[y & 7];
1082 dr1 = dg1 = d32[(i * 2 + 0) & 7];
1083 db1 = d64[(i * 2 + 0) & 7];
1084 dr2 = dg2 = d32[(i * 2 + 1) & 7];
1085 db2 = d64[(i * 2 + 1) & 7];
1087 const uint8_t * const d64 = dither_8x8_73 [y & 7];
1088 const uint8_t * const d128 = dither_8x8_220[y & 7];
1089 dr1 = db1 = d128[(i * 2 + 0) & 7];
1090 dg1 = d64[(i * 2 + 0) & 7];
1091 dr2 = db2 = d128[(i * 2 + 1) & 7];
1092 dg2 = d64[(i * 2 + 1) & 7];
/* RGB4/BGR4 packs both pixels into one byte (second pixel in the high nibble) */
1095 if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1096 dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1097 ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1099 dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1100 dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
/*
 * Multi-tap vertical scaling + YUV->RGB output: for each pair of output
 * pixels, run the vertical FIR (lumFilter/chrFilter) over the input slices,
 * clip to 8 bits, look up the chroma-offset tables and hand off to
 * yuv2rgb_write().  Alpha is filtered the same way when alpSrc is used.
 * NOTE(review): initializers of Y1/Y2/U/V (and A1/A2) fall in lines missing
 * from this extraction — the 0x100 tests below imply a rounding bias plus
 * >> shift happens before them; confirm against the full source.
 */
1105 static av_always_inline void
1106 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1107 const int16_t **lumSrc, int lumFilterSize,
1108 const int16_t *chrFilter, const int16_t **chrUSrc,
1109 const int16_t **chrVSrc, int chrFilterSize,
1110 const int16_t **alpSrc, uint8_t *dest, int dstW,
1111 int y, enum PixelFormat target, int hasAlpha)
/* two output pixels per iteration (chroma is horizontally subsampled 2:1) */
1115 for (i = 0; i < (dstW >> 1); i++) {
1121 int av_unused A1, A2;
1122 const void *r, *g, *b;
/* vertical luma FIR over lumFilterSize source lines */
1124 for (j = 0; j < lumFilterSize; j++) {
1125 Y1 += lumSrc[j][i * 2] * lumFilter[j];
1126 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
/* vertical chroma FIR */
1128 for (j = 0; j < chrFilterSize; j++) {
1129 U += chrUSrc[j][i] * chrFilter[j];
1130 V += chrVSrc[j][i] * chrFilter[j];
/* cheap out-of-range test: only clip when some value overflowed 8 bits */
1136 if ((Y1 | Y2 | U | V) & 0x100) {
1137 Y1 = av_clip_uint8(Y1);
1138 Y2 = av_clip_uint8(Y2);
1139 U = av_clip_uint8(U);
1140 V = av_clip_uint8(V);
1145 for (j = 0; j < lumFilterSize; j++) {
1146 A1 += alpSrc[j][i * 2 ] * lumFilter[j];
1147 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1151 if ((A1 | A2) & 0x100) {
1152 A1 = av_clip_uint8(A1);
1153 A2 = av_clip_uint8(A2);
1157 /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1159 g = (c->table_gU[U] + c->table_gV[V]);
1162 yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1163 r, g, b, y, target, hasAlpha);
/*
 * Two-line vertical bilinear blend + YUV->RGB output.  buf[0]/buf[1] are the
 * two neighbouring source lines; yalpha/uvalpha are 12-bit blend weights
 * (0..4095), so x*w0 + y*w1 with w0+w1 = 4095 followed by >> 19 yields an
 * 8-bit sample from the 15-bit intermediates.
 */
1167 static av_always_inline void
1168 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1169 const int16_t *ubuf[2], const int16_t *vbuf[2],
1170 const int16_t *abuf[2], uint8_t *dest, int dstW,
1171 int yalpha, int uvalpha, int y,
1172 enum PixelFormat target, int hasAlpha)
1174 const int16_t *buf0 = buf[0], *buf1 = buf[1],
1175 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1176 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1177 *abuf0 = hasAlpha ? abuf[0] : NULL,
1178 *abuf1 = hasAlpha ? abuf[1] : NULL;
/* complementary weights for the first line */
1179 int yalpha1 = 4095 - yalpha;
1180 int uvalpha1 = 4095 - uvalpha;
1183 for (i = 0; i < (dstW >> 1); i++) {
1184 int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
1185 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
1186 int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
1187 int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
1189 const void *r = c->table_rV[V],
1190 *g = (c->table_gU[U] + c->table_gV[V]),
1191 *b = c->table_bU[U];
/* alpha blended with the luma weights (declaration of A1/A2 not visible here) */
1194 A1 = (abuf0[i * 2 ] * yalpha1 + abuf1[i * 2 ] * yalpha) >> 19;
1195 A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1198 yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1199 r, g, b, y, target, hasAlpha);
/*
 * Single-source-line ("unscaled vertical") YUV->RGB output.  Luma comes from
 * buf0 alone (>> 7 converts the 15-bit intermediate to 8 bits).  For chroma,
 * uvalpha < 2048 means the nearest chroma line (ubuf1/vbuf1) is used as-is;
 * otherwise the two chroma lines are averaged ((a+b) >> 8 from 15-bit).
 */
1203 static av_always_inline void
1204 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1205 const int16_t *ubuf[2], const int16_t *vbuf[2],
1206 const int16_t *abuf0, uint8_t *dest, int dstW,
1207 int uvalpha, int y, enum PixelFormat target,
1210 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1211 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
/* nearest-neighbour chroma path */
1214 if (uvalpha < 2048) {
1215 for (i = 0; i < (dstW >> 1); i++) {
1216 int Y1 = buf0[i * 2] >> 7;
1217 int Y2 = buf0[i * 2 + 1] >> 7;
1218 int U = ubuf1[i] >> 7;
1219 int V = vbuf1[i] >> 7;
1221 const void *r = c->table_rV[V],
1222 *g = (c->table_gU[U] + c->table_gV[V]),
1223 *b = c->table_bU[U];
1226 A1 = abuf0[i * 2 ] >> 7;
1227 A2 = abuf0[i * 2 + 1] >> 7;
1230 yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1231 r, g, b, y, target, hasAlpha);
/* averaged chroma path */
1234 for (i = 0; i < (dstW >> 1); i++) {
1235 int Y1 = buf0[i * 2] >> 7;
1236 int Y2 = buf0[i * 2 + 1] >> 7;
1237 int U = (ubuf0[i] + ubuf1[i]) >> 8;
1238 int V = (vbuf0[i] + vbuf1[i]) >> 8;
1240 const void *r = c->table_rV[V],
1241 *g = (c->table_gU[U] + c->table_gV[V]),
1242 *b = c->table_bU[U];
1245 A1 = abuf0[i * 2 ] >> 7;
1246 A2 = abuf0[i * 2 + 1] >> 7;
1249 yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1250 r, g, b, y, target, hasAlpha);
/*
 * Wrapper generators: stamp out concrete _X_c / _2_c / _1_c output functions
 * from the av_always_inline templates above, baking in the pixel format and
 * the hasAlpha condition so the compiler specializes each variant.
 * (No comments are inserted inside the macros: they are backslash-continued.)
 */
1255 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1256 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1257 const int16_t **lumSrc, int lumFilterSize, \
1258 const int16_t *chrFilter, const int16_t **chrUSrc, \
1259 const int16_t **chrVSrc, int chrFilterSize, \
1260 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1263 name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1264 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1265 alpSrc, dest, dstW, y, fmt, hasAlpha); \
/* YUV2RGBWRAPPER additionally generates the 2-line-blend and 1-line variants */
1267 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1268 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1269 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1270 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1271 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1272 int yalpha, int uvalpha, int y) \
1274 name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1275 dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1278 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1279 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1280 const int16_t *abuf0, uint8_t *dest, int dstW, \
1281 int uvalpha, int y) \
1283 name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1284 dstW, uvalpha, y, fmt, hasAlpha); \
/* Instantiations: runtime-alpha 32-bit variants, forced-alpha (a32*) and
 * no-alpha (x32*) variants, then 24/16/15/12/8/4-bit targets. */
1288 YUV2RGBWRAPPER(yuv2rgb,, 32_1, PIX_FMT_RGB32_1, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1289 YUV2RGBWRAPPER(yuv2rgb,, 32, PIX_FMT_RGB32, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1291 #if CONFIG_SWSCALE_ALPHA
1292 YUV2RGBWRAPPER(yuv2rgb,, a32_1, PIX_FMT_RGB32_1, 1)
1293 YUV2RGBWRAPPER(yuv2rgb,, a32, PIX_FMT_RGB32, 1)
1295 YUV2RGBWRAPPER(yuv2rgb,, x32_1, PIX_FMT_RGB32_1, 0)
1296 YUV2RGBWRAPPER(yuv2rgb,, x32, PIX_FMT_RGB32, 0)
1298 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24, 0)
1299 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24, 0)
1300 YUV2RGBWRAPPER(yuv2rgb,, 16, PIX_FMT_RGB565, 0)
1301 YUV2RGBWRAPPER(yuv2rgb,, 15, PIX_FMT_RGB555, 0)
1302 YUV2RGBWRAPPER(yuv2rgb,, 12, PIX_FMT_RGB444, 0)
1303 YUV2RGBWRAPPER(yuv2rgb,, 8, PIX_FMT_RGB8, 0)
1304 YUV2RGBWRAPPER(yuv2rgb,, 4, PIX_FMT_RGB4, 0)
1305 YUV2RGBWRAPPER(yuv2rgb,, 4b, PIX_FMT_RGB4_BYTE, 0)
/*
 * Full-chroma-resolution YUV->RGB: one chroma sample per output pixel (no
 * 2:1 subsampling), computed arithmetically from the per-context coefficients
 * instead of lookup tables.  `step` is the output pixel stride in bytes
 * (3 for 24-bit, 4 for 32-bit formats).
 * NOTE(review): the R/G/B byte stores (>> from 30-bit) fall in lines missing
 * from this extraction; only the alpha stores are visible below.
 */
1307 static av_always_inline void
1308 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1309 const int16_t **lumSrc, int lumFilterSize,
1310 const int16_t *chrFilter, const int16_t **chrUSrc,
1311 const int16_t **chrVSrc, int chrFilterSize,
1312 const int16_t **alpSrc, uint8_t *dest,
1313 int dstW, int y, enum PixelFormat target, int hasAlpha)
1316 int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1318 for (i = 0; i < dstW; i++) {
/* rounding bias minus the 128 chroma offset, pre-scaled to the FIR domain */
1321 int U = (1<<9)-(128 << 19);
1322 int V = (1<<9)-(128 << 19);
1326 for (j = 0; j < lumFilterSize; j++) {
1327 Y += lumSrc[j][i] * lumFilter[j];
1329 for (j = 0; j < chrFilterSize; j++) {
1330 U += chrUSrc[j][i] * chrFilter[j];
1331 V += chrVSrc[j][i] * chrFilter[j];
1338 for (j = 0; j < lumFilterSize; j++) {
1339 A += alpSrc[j][i] * lumFilter[j];
1343 A = av_clip_uint8(A);
/* matrix multiply using the context's fixed-point BT.601-style coefficients */
1345 Y -= c->yuv2rgb_y_offset;
1346 Y *= c->yuv2rgb_y_coeff;
1348 R = Y + V*c->yuv2rgb_v2r_coeff;
1349 G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1350 B = Y + U*c->yuv2rgb_u2b_coeff;
/* clip only when some 30-bit intermediate went out of range */
1351 if ((R | G | B) & 0xC0000000) {
1352 R = av_clip_uintp2(R, 30);
1353 G = av_clip_uintp2(G, 30);
1354 B = av_clip_uintp2(B, 30);
/* alpha byte position depends on the target layout (leading vs trailing) */
1359 dest[0] = hasAlpha ? A : 255;
1373 dest[3] = hasAlpha ? A : 255;
1376 dest[0] = hasAlpha ? A : 255;
1390 dest[3] = hasAlpha ? A : 255;
/* Instantiations of the full-chroma writers for the 32- and 24-bit targets */
1398 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1399 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1400 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1401 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1403 #if CONFIG_SWSCALE_ALPHA
1404 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA, 1)
1405 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR, 1)
1406 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA, 1)
1407 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB, 1)
1409 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA, 0)
1410 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR, 0)
1411 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA, 0)
1412 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB, 0)
1414 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, PIX_FMT_BGR24, 0)
1415 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, PIX_FMT_RGB24, 0)
/*
 * Fill a `width` x `height` rectangle of a plane with the constant byte
 * value, starting at scanline y.  (The parameter list continues on lines not
 * visible in this extraction: presumably `int y, uint8_t val` — confirm.)
 */
1417 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1418 int width, int height,
1422 uint8_t *ptr = plane + stride*y;
1423 for (i=0; i<height; i++) {
1424 memset(ptr, val, width);
/* Read one 16-bit component with the byte order implied by `origin`. */
1429 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
/* For BGR48 layouts the first/third components swap roles, so `r`/`b`
 * transparently select between the r_b / b_r locals read below. */
1431 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1432 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
/*
 * 48-bit RGB/BGR -> 16-bit luma, using the BT.601 RY/GY/BY coefficients and
 * a rounding bias that also adds the 16*256 luma offset.
 */
1434 static av_always_inline void
1435 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1436 enum PixelFormat origin)
1439 for (i = 0; i < width; i++) {
1440 unsigned int r_b = input_pixel(&src[i*3+0]);
1441 unsigned int g = input_pixel(&src[i*3+1]);
1442 unsigned int b_r = input_pixel(&src[i*3+2]);
1444 dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
/*
 * 48-bit RGB/BGR -> 16-bit chroma at full horizontal resolution.
 * (src2 is unused here; presumably asserted equal to src1 elsewhere.)
 */
1448 static av_always_inline void
1449 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1450 const uint16_t *src1, const uint16_t *src2,
1451 int width, enum PixelFormat origin)
1455 for (i = 0; i < width; i++) {
1456 int r_b = input_pixel(&src1[i*3+0]);
1457 int g = input_pixel(&src1[i*3+1]);
1458 int b_r = input_pixel(&src1[i*3+2]);
1460 dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1461 dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
/*
 * Same, but with 2:1 horizontal chroma decimation: each output sample is the
 * rounded average of two adjacent input pixels.
 */
1465 static av_always_inline void
1466 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1467 const uint16_t *src1, const uint16_t *src2,
1468 int width, enum PixelFormat origin)
1472 for (i = 0; i < width; i++) {
1473 int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1474 int g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1475 int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1477 dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1478 dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
/*
 * Generate the concrete ToY / ToUV / ToUV_half entry points for one
 * 48-bit format (rgb/bgr x LE/BE), forwarding to the templates above with
 * the pixel format baked in.  Wrappers only cast the byte pointers to
 * uint16_t pointers.  (Backslash-continued macro: no inline comments.)
 */
1486 #define rgb48funcs(pattern, BE_LE, origin) \
1487 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
1488 int width, uint32_t *unused) \
1490 const uint16_t *src = (const uint16_t *) _src; \
1491 uint16_t *dst = (uint16_t *) _dst; \
1492 rgb48ToY_c_template(dst, src, width, origin); \
1495 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1496 const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1497 int width, uint32_t *unused) \
1499 const uint16_t *src1 = (const uint16_t *) _src1, \
1500 *src2 = (const uint16_t *) _src2; \
1501 uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1502 rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1505 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1506 const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1507 int width, uint32_t *unused) \
1509 const uint16_t *src1 = (const uint16_t *) _src1, \
1510 *src2 = (const uint16_t *) _src2; \
1511 uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1512 rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
/* all four 48-bit variants */
1515 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE)
1516 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE)
1517 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE)
1518 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
/* Fetch one packed pixel: native 32-bit load for the 4-byte-per-pixel
 * formats, endian-aware 16-bit load otherwise. */
1520 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1521 origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1522 (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
/*
 * Generic packed 16/32-bit RGB -> luma.  The shr/shg/shb + mask parameters
 * describe the channel layout, rsh/gsh/bsh pre-scale the BT.601 coefficients
 * so the sum is taken at precision S, producing 9-bit-headroom output
 * (>> (S-6) leaves 15-bit samples).
 */
1524 static av_always_inline void
1525 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1526 int width, enum PixelFormat origin,
1527 int shr, int shg, int shb, int shp,
1528 int maskr, int maskg, int maskb,
1529 int rsh, int gsh, int bsh, int S)
1531 const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
/* rounding constant: luma offset (16 << ...) plus half an output LSB */
1532 const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
1535 for (i = 0; i < width; i++) {
1536 int px = input_pixel(i) >> shp;
1537 int b = (px & maskb) >> shb;
1538 int g = (px & maskg) >> shg;
1539 int r = (px & maskr) >> shr;
1541 dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
/*
 * Packed 16/32-bit RGB -> chroma at full horizontal resolution.
 */
1545 static av_always_inline void
1546 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1547 const uint8_t *src, int width,
1548 enum PixelFormat origin,
1549 int shr, int shg, int shb, int shp,
1550 int maskr, int maskg, int maskb,
1551 int rsh, int gsh, int bsh, int S)
1553 const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1554 rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
/* 128-centred chroma offset plus half an output LSB */
1555 const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
1558 for (i = 0; i < width; i++) {
1559 int px = input_pixel(i) >> shp;
1560 int b = (px & maskb) >> shb;
1561 int g = (px & maskg) >> shg;
1562 int r = (px & maskr) >> shr;
1564 dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1565 dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
/*
 * Packed 16/32-bit RGB -> chroma with 2:1 horizontal decimation.  Two
 * adjacent pixels are summed while still packed: green is extracted via
 * maskgx, red+blue remain in `rb`; the masks are widened by one bit
 * (mask |= mask << 1) to hold the two-pixel sums.
 */
1569 static av_always_inline void
1570 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1571 const uint8_t *src, int width,
1572 enum PixelFormat origin,
1573 int shr, int shg, int shb, int shp,
1574 int maskr, int maskg, int maskb,
1575 int rsh, int gsh, int bsh, int S)
1577 const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1578 rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1579 maskgx = ~(maskr | maskb);
1580 const unsigned rnd = (256U<<(S)) + (1<<(S-6));
1583 maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1584 for (i = 0; i < width; i++) {
1585 int px0 = input_pixel(2 * i + 0) >> shp;
1586 int px1 = input_pixel(2 * i + 1) >> shp;
1587 int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1588 int rb = px0 + px1 - g;
1590 b = (rb & maskb) >> shb;
/* 565-style layouts need the green sum re-masked (carry into red field) */
1591 if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1592 origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1595 g = (g & maskg) >> shg;
1597 r = (rb & maskr) >> shr;
1599 dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1600 dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
/*
 * Generate ToY / ToUV / ToUV_half entry points for one packed RGB layout,
 * passing the layout's shifts/masks/precision to the generic templates.
 * (Backslash-continued macro: no inline comments.)
 */
1606 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1607 maskg, maskb, rsh, gsh, bsh, S) \
1608 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
1609 int width, uint32_t *unused) \
1611 rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, \
1612 shr, shg, shb, shp, \
1613 maskr, maskg, maskb, rsh, gsh, bsh, S); \
1616 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1617 const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1618 int width, uint32_t *unused) \
1620 rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
1621 shr, shg, shb, shp, \
1622 maskr, maskg, maskb, rsh, gsh, bsh, S); \
1625 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1626 const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1627 int width, uint32_t *unused) \
1629 rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
1630 shr, shg, shb, shp, \
1631 maskr, maskg, maskb, rsh, gsh, bsh, S); \
/* One instantiation per supported packed layout; columns are
 * shr/shg/shb/shp, the three channel masks, coefficient shifts, precision. */
1634 rgb16_32_wrapper(PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT+8)
1635 rgb16_32_wrapper(PIX_FMT_BGR32_1, bgr321, 16, 0, 0, 8, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT+8)
1636 rgb16_32_wrapper(PIX_FMT_RGB32, rgb32, 0, 0, 16, 0, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT+8)
1637 rgb16_32_wrapper(PIX_FMT_RGB32_1, rgb321, 0, 0, 16, 8, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT+8)
1638 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT+8)
1639 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT+7)
1640 rgb16_32_wrapper(PIX_FMT_BGR444LE, bgr12le, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT+4)
1641 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT+8)
1642 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT+7)
1643 rgb16_32_wrapper(PIX_FMT_RGB444LE, rgb12le, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT+4)
1644 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT+8)
1645 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT+7)
1646 rgb16_32_wrapper(PIX_FMT_BGR444BE, bgr12be, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT+4)
1647 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT+8)
1648 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT+7)
1649 rgb16_32_wrapper(PIX_FMT_RGB444BE, rgb12be, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT+4)
/*
 * Planar GBR (8-bit, G/B/R planes) -> chroma with 2:1 horizontal decimation:
 * each component is the sum of two adjacent samples (averaging folded into
 * the final shift, hence the extra +1 in the shift amount).
 */
1651 static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
1652 const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
1653 int width, enum PixelFormat origin)
1656 for (i = 0; i < width; i++) {
1657 unsigned int g = gsrc[2*i] + gsrc[2*i+1];
1658 unsigned int b = bsrc[2*i] + bsrc[2*i+1];
1659 unsigned int r = rsrc[2*i] + rsrc[2*i+1];
/* chroma offset + rounding, output left at 15-bit intermediate precision */
1661 dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1662 dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
/*
 * Extract the alpha channel from packed pixels whose alpha is the FIRST of
 * each group of four bytes (ABGR/ARGB byte order), widening each 8-bit
 * sample to the 14-bit intermediate range used by the scaler (<< 6).
 */
static void abgrToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int i;

    for (i = 0; i < width; i++)
        dst[i] = src[4 * i] << 6;
}
/*
 * Extract the alpha channel from packed pixels whose alpha is the LAST of
 * each group of four bytes (RGBA/BGRA byte order), widening each 8-bit
 * sample to the 14-bit intermediate range used by the scaler (<< 6).
 */
static void rgbaToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int i;

    for (i = 0; i < width; i++)
        dst[i] = src[4 * i + 3] << 6;
}
/*
 * Paletted input: look up each index in the 32-bit AYUV palette `pal` and
 * extract one component, widened to 14-bit (<< 6).
 * NOTE(review): the palette-index load (presumably `int d = src[i];`) falls
 * in lines missing from this extraction.
 */
1682 static void palToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
1685 for (i=0; i<width; i++) {
1688 dst[i]= (pal[d] >> 24)<<6;
/* NOTE(review): `long width` differs from the `int width` used by every
 * sibling converter — looks like an oversight; confirm before unifying. */
1692 static void palToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, long width, uint32_t *pal)
1695 for (i=0; i<width; i++) {
1698 dst[i]= (pal[d] & 0xFF)<<6;
/* U from palette byte 1, V from byte 2.
 * NOTE(review): dstU is uint16_t* while dstV is int16_t* — inconsistent
 * signedness in the signature; verify against callers before changing. */
1702 static void palToUV_c(uint16_t *dstU, int16_t *dstV,
1703 const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1704 int width, uint32_t *pal)
1707 assert(src1 == src2);
1708 for (i=0; i<width; i++) {
1709 int p= pal[src1[i]];
1711 dstU[i]= (uint8_t)(p>> 8)<<6;
1712 dstV[i]= (uint8_t)(p>>16)<<6;
/*
 * 1-bpp monochrome -> 15-bit luma: each input byte holds 8 pixels, MSB
 * first; every bit becomes 0 or 16383.  monowhite / monoblack differ only in
 * the polarity of the byte read (the `int d = ...` load is in lines missing
 * from this extraction — presumably inverted for monowhite).  The trailing
 * loop handles the final partial byte when width is not a multiple of 8.
 */
1716 static void monowhite2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1719 for (i=0; i<width/8; i++) {
1722 dst[8*i+j]= ((d>>(7-j))&1)*16383;
1726 for(j=0; j<(width&7); j++)
1727 dst[8*i+j]= ((d>>(7-j))&1)*16383;
/* Same bit-unpacking with the opposite black/white polarity. */
1731 static void monoblack2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1734 for (i=0; i<width/8; i++) {
1737 dst[8*i+j]= ((d>>(7-j))&1)*16383;
1741 for(j=0; j<(width&7); j++)
1742 dst[8*i+j]= ((d>>(7-j))&1)*16383;
1746 //FIXME yuy2* can read up to 7 samples too much
/* YUYV packed -> luma plane (the `dst[i] = src[2*i];` body line is missing
 * from this extraction). */
1748 static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1752 for (i=0; i<width; i++)
/* YUYV packed -> U/V planes: U at byte offset 1, V at offset 3 of each
 * 4-byte Y0-U-Y1-V group.  src2 must alias src1 (asserted below). */
1756 static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1757 const uint8_t *src2, int width, uint32_t *unused)
1760 for (i=0; i<width; i++) {
1761 dstU[i]= src1[4*i + 1];
1762 dstV[i]= src1[4*i + 3];
1764 assert(src1 == src2);
/* Byte-swap a 16-bit-per-sample luma row (for foreign-endian input). */
1767 static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1770 const uint16_t *src = (const uint16_t *) _src;
1771 uint16_t *dst = (uint16_t *) _dst;
1772 for (i=0; i<width; i++) {
1773 dst[i] = av_bswap16(src[i]);
/* Byte-swap 16-bit U and V rows; the two sources are independent planes. */
1777 static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
1778 const uint8_t *_src2, int width, uint32_t *unused)
1781 const uint16_t *src1 = (const uint16_t *) _src1,
1782 *src2 = (const uint16_t *) _src2;
1783 uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV;
1784 for (i=0; i<width; i++) {
1785 dstU[i] = av_bswap16(src1[i]);
1786 dstV[i] = av_bswap16(src2[i]);
1790 /* This is almost identical to the previous, and exists only because
1791 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/* UYVY packed -> luma plane (the body line reading src[2*i+1] is missing
 * from this extraction). */
1792 static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1796 for (i=0; i<width; i++)
/* UYVY packed -> U/V planes: U at byte offset 0, V at offset 2 of each
 * 4-byte U-Y0-V-Y1 group.  src2 must alias src1 (asserted below). */
1800 static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1801 const uint8_t *src2, int width, uint32_t *unused)
1804 for (i=0; i<width; i++) {
1805 dstU[i]= src1[4*i + 0];
1806 dstV[i]= src1[4*i + 2];
1808 assert(src1 == src2);
/* De-interleave a semi-planar (NV12/NV21-style) chroma row into two planes. */
1811 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1812 const uint8_t *src, int width)
1815 for (i = 0; i < width; i++) {
1816 dst1[i] = src[2*i+0];
1817 dst2[i] = src[2*i+1];
/* NV12: interleaved order is U,V */
1821 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1822 const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1823 int width, uint32_t *unused)
1825 nvXXtoUV_c(dstU, dstV, src1, width);
/* NV21: interleaved order is V,U — note the swapped destination pointers */
1828 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1829 const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1830 int width, uint32_t *unused)
1832 nvXXtoUV_c(dstV, dstU, src1, width);
1835 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
/*
 * Packed 24-bit BGR -> luma; output stays at 15-bit intermediate precision
 * (>> (RGB2YUV_SHIFT-6)); the bias folds in the 16-level luma offset plus
 * rounding.  (The b/g/r byte loads fall in lines missing from this
 * extraction.)
 */
1837 static void bgr24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
1838 int width, uint32_t *unused)
1841 for (i=0; i<width; i++) {
1846 dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
/* BGR24 -> chroma, full horizontal resolution; src2 must alias src1. */
1850 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1851 const uint8_t *src2, int width, uint32_t *unused)
1854 for (i=0; i<width; i++) {
1855 int b= src1[3*i + 0];
1856 int g= src1[3*i + 1];
1857 int r= src1[3*i + 2];
1859 dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1860 dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1862 assert(src1 == src2);
/* BGR24 -> chroma with 2:1 decimation: adjacent pixel pairs are summed,
 * averaging folded into the wider bias/shift. */
1865 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1866 const uint8_t *src2, int width, uint32_t *unused)
1869 for (i=0; i<width; i++) {
1870 int b= src1[6*i + 0] + src1[6*i + 3];
1871 int g= src1[6*i + 1] + src1[6*i + 4];
1872 int r= src1[6*i + 2] + src1[6*i + 5];
1874 dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1875 dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1877 assert(src1 == src2);
/* RGB24 variants: identical math, opposite channel order in memory. */
1880 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1884 for (i=0; i<width; i++) {
1889 dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1893 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1894 const uint8_t *src2, int width, uint32_t *unused)
1898 for (i=0; i<width; i++) {
1899 int r= src1[3*i + 0];
1900 int g= src1[3*i + 1];
1901 int b= src1[3*i + 2];
1903 dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1904 dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1908 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1909 const uint8_t *src2, int width, uint32_t *unused)
1913 for (i=0; i<width; i++) {
1914 int r= src1[6*i + 0] + src1[6*i + 3];
1915 int g= src1[6*i + 1] + src1[6*i + 4];
1916 int b= src1[6*i + 2] + src1[6*i + 5];
1918 dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1919 dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
/*
 * Planar 8-bit RGB (G/B/R plane order, as src[0]/src[1]/src[2]) -> luma at
 * 15-bit intermediate precision.  (The per-plane r/g/b loads fall in lines
 * missing from this extraction.)
 */
1923 static void planar_rgb_to_y(uint16_t *dst, const uint8_t *src[4], int width)
1926 for (i = 0; i < width; i++) {
1931 dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
/* Planar 16-bit little-endian RGB -> 16-bit luma (full-precision output). */
1935 static void planar_rgb16le_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1938 const uint16_t **src = (const uint16_t **) _src;
1939 uint16_t *dst = (uint16_t *) _dst;
1940 for (i = 0; i < width; i++) {
1941 int g = AV_RL16(src[0] + i);
1942 int b = AV_RL16(src[1] + i);
1943 int r = AV_RL16(src[2] + i);
1945 dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
/* Big-endian twin of the above. */
1949 static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1952 const uint16_t **src = (const uint16_t **) _src;
1953 uint16_t *dst = (uint16_t *) _dst;
1954 for (i = 0; i < width; i++) {
1955 int g = AV_RB16(src[0] + i);
1956 int b = AV_RB16(src[1] + i);
1957 int r = AV_RB16(src[2] + i);
1959 dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
/* Planar 8-bit RGB -> chroma at 15-bit intermediate precision. */
1963 static void planar_rgb_to_uv(uint16_t *dstU, uint16_t *dstV, const uint8_t *src[4], int width)
1966 for (i = 0; i < width; i++) {
1971 dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1972 dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
/* Planar 16-bit LE RGB -> 16-bit chroma. */
1976 static void planar_rgb16le_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1979 const uint16_t **src = (const uint16_t **) _src;
1980 uint16_t *dstU = (uint16_t *) _dstU;
1981 uint16_t *dstV = (uint16_t *) _dstV;
1982 for (i = 0; i < width; i++) {
1983 int g = AV_RL16(src[0] + i);
1984 int b = AV_RL16(src[1] + i);
1985 int r = AV_RL16(src[2] + i);
1987 dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
1988 dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
/* Big-endian twin of the above. */
1992 static void planar_rgb16be_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1995 const uint16_t **src = (const uint16_t **) _src;
1996 uint16_t *dstU = (uint16_t *) _dstU;
1997 uint16_t *dstV = (uint16_t *) _dstV;
1998 for (i = 0; i < width; i++) {
1999 int g = AV_RB16(src[0] + i);
2000 int b = AV_RB16(src[1] + i);
2001 int r = AV_RB16(src[2] + i);
2003 dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2004 dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
/*
 * Horizontal FIR scaling of a 16-bit input row to 19-bit output samples
 * (stored in int32).  filterPos[i] is the first source sample for output i;
 * filter holds filterSize 14-bit taps per output sample.  The shift `sh`
 * is derived from the source bit depth (its assignment is in lines missing
 * from this extraction; the `bits`/RGB special case below feeds it).
 */
2008 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
2009 const int16_t *filter,
2010 const int16_t *filterPos, int filterSize)
2013 int32_t *dst = (int32_t *) _dst;
2014 const uint16_t *src = (const uint16_t *) _src;
2015 int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2018 if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2021 for (i = 0; i < dstW; i++) {
2023 int srcPos = filterPos[i];
2026 for (j = 0; j < filterSize; j++) {
2027 val += src[srcPos + j] * filter[filterSize * i + j];
2029 // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
2030 dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
/* Same, but producing 15-bit output in int16. */
2034 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
2035 const int16_t *filter,
2036 const int16_t *filterPos, int filterSize)
2039 const uint16_t *src = (const uint16_t *) _src;
2040 int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
/* RGB/PAL8 inputs are pre-scaled, hence the fixed shift of 13 */
2043 sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2045 for (i = 0; i < dstW; i++) {
2047 int srcPos = filterPos[i];
2050 for (j = 0; j < filterSize; j++) {
2051 val += src[srcPos + j] * filter[filterSize * i + j];
2053 // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2054 dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2058 // bilinear / bicubic scaling
/*
 * Horizontal FIR scaling of an 8-bit input row to 15-bit output; the FFMIN
 * clamp guards against overflow from negative-lobe (bicubic) filters.
 */
2059 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2060 const int16_t *filter, const int16_t *filterPos,
2064 for (i=0; i<dstW; i++) {
2066 int srcPos= filterPos[i];
2068 for (j=0; j<filterSize; j++) {
2069 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2071 //filter += hFilterSize;
2072 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
/* Same, but producing 19-bit output in int32 (high-bit-depth pipeline). */
2077 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2078 const int16_t *filter, const int16_t *filterPos,
2082 int32_t *dst = (int32_t *) _dst;
2083 for (i=0; i<dstW; i++) {
2085 int srcPos= filterPos[i];
2087 for (j=0; j<filterSize; j++) {
2088 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2090 //filter += hFilterSize;
2091 dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2096 //FIXME all pal and rgb srcFormats could do this conversion as well
2097 //FIXME all scalers more complex than bilinear could do half of this transform
/*
 * Expand MPEG-range (16..240) chroma to JPEG full range, in place, on
 * 15-bit intermediate samples: out = (min(in, 30775) * 4663 - 9289992) >> 12
 * (about -264 at the black level).  The clamp keeps the multiply from
 * overflowing past full-scale input.
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int i;

    for (i = 0; i < width; i++) {
        int u = dstU[i] < 30775 ? dstU[i] : 30775;
        int v = dstV[i] < 30775 ? dstV[i] : 30775;

        dstU[i] = (u * 4663 - 9289992) >> 12;
        dstV[i] = (v * 4663 - 9289992) >> 12;
    }
}
/**
 * Compress one line of full-range (JPEG) chroma samples back to limited
 * (MPEG) range, in place.  Inverse of chrRangeToJpeg_c().
 *
 * @param dstU  U samples in the 15-bit intermediate format, modified in place
 * @param dstV  V samples in the 15-bit intermediate format, modified in place
 * @param width number of samples per plane
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int n;

    for (n = 0; n < width; n++) {
        dstU[n] = (dstU[n] * 1799 + 4081085) >> 11; // +1469 after the shift
        dstV[n] = (dstV[n] * 1799 + 4081085) >> 11; // +1469
    }
}
/**
 * Expand one line of 15-bit limited-range (MPEG) luma samples to
 * full-range (JPEG) levels, in place.
 *
 * @param dst   luma samples in the 15-bit intermediate format, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int i;

    for (i = 0; i < width; i++) {
        /* clamp first so the scaled value cannot exceed 15 bits */
        int y = dst[i] > 30189 ? 30189 : dst[i];
        dst[i] = (y * 19077 - 39057361) >> 14;
    }
}
/**
 * Compress one line of full-range (JPEG) luma samples back to limited
 * (MPEG) range, in place.  Inverse of lumRangeToJpeg_c().
 *
 * @param dst   luma samples in the 15-bit intermediate format, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int n;

    for (n = 0; n < width; n++)
        dst[n] = (dst[n] * 14071 + 33561947) >> 14;
}
/**
 * Expand one line of 19-bit limited-range (MPEG) chroma samples to
 * full-range (JPEG) levels, in place.  16-bit-depth variant of
 * chrRangeToJpeg_c(); the data is really int32_t behind the int16_t*
 * interface.
 *
 * The multiply is performed in unsigned arithmetic: at the clamp value
 * (30775 << 4) the product 492400 * 4663 = 2296061200 exceeds INT32_MAX,
 * so a signed multiply would be undefined behavior.  After subtracting
 * the offset the result is back below INT32_MAX, so the cast to int and
 * the arithmetic shift are well defined for in-range inputs.
 *
 * @param _dstU U samples (int32_t data), modified in place
 * @param _dstV V samples (int32_t data), modified in place
 * @param width number of samples per plane
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        /* clamp first so the unsigned product minus the offset fits in an int */
        int32_t u = dstU[i] > (30775 << 4) ? (30775 << 4) : dstU[i];
        int32_t v = dstV[i] > (30775 << 4) ? (30775 << 4) : dstV[i];

        dstU[i] = (int)(u * 4663U - (9289992 << 4)) >> 12; // -264 after the shift
        dstV[i] = (int)(v * 4663U - (9289992 << 4)) >> 12; // -264
    }
}
/**
 * Compress one line of full-range (JPEG) chroma samples back to limited
 * (MPEG) range, in place.  16-bit-depth variant of chrRangeFromJpeg_c();
 * the data is really int32_t behind the int16_t* interface.
 *
 * @param _dstU U samples (int32_t data), modified in place
 * @param _dstV V samples (int32_t data), modified in place
 * @param width number of samples per plane
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    int n;

    for (n = 0; n < width; n++) {
        dstU[n] = (dstU[n] * 1799 + (4081085 << 4)) >> 11; // +1469 after the shift
        dstV[n] = (dstV[n] * 1799 + (4081085 << 4)) >> 11; // +1469
    }
}
/**
 * Expand one line of 19-bit limited-range (MPEG) luma samples to
 * full-range (JPEG) levels, in place.  16-bit-depth variant of
 * lumRangeToJpeg_c(); the data is really int32_t behind the int16_t*
 * interface.
 *
 * The multiply is performed in unsigned arithmetic: at the clamp value
 * (30189 << 4) the product 483024 * 4769 = 2303541456 exceeds INT32_MAX,
 * so a signed multiply would be undefined behavior.  After subtracting
 * the offset the result is back below INT32_MAX, so the cast to int and
 * the arithmetic shift are well defined for in-range inputs.
 *
 * @param _dst  luma samples (int32_t data), modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++) {
        /* clamp first so the unsigned product minus the offset fits in an int */
        int32_t y = dst[i] > (30189 << 4) ? (30189 << 4) : dst[i];
        dst[i] = (int)(y * 4769U - (39057361 << 2)) >> 12;
    }
}
/**
 * Compress one line of full-range (JPEG) luma samples back to limited
 * (MPEG) range, in place.  16-bit-depth variant of lumRangeFromJpeg_c();
 * the data is really int32_t behind the int16_t* interface.
 *
 * @param _dst  luma samples (int32_t data), modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int32_t *dst = (int32_t *) _dst;
    int n;

    /* coefficient and offset are pre-divided by 4 so the sum stays in 32 bits */
    for (n = 0; n < width; n++)
        dst[n] = (dst[n] * (14071 / 4) + (33561947 << 4) / 4) >> 12;
}
/**
 * Fast-bilinear horizontal luma scaler: walks the source with a 16.16
 * fixed-point position, linearly interpolating between adjacent samples
 * into the 15-bit intermediate format, then rewrites the tail pixels
 * that would read past srcW-1 with the last source sample.
 *
 * NOTE(review): the per-iteration advance of xpos (by xInc) is missing
 * from this copy of the file — TODO confirm against upstream.
 */
2162 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2163 const uint8_t *src, int srcW, int xInc)
2166 unsigned int xpos=0;
2167 for (i=0;i<dstWidth;i++) {
2168 register unsigned int xx=xpos>>16;
2169 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2170 dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
// tail fixup: clamp output pixels whose source position reaches srcW-1
2173 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2174 dst[i] = src[srcW-1]*128;
2177 // *** horizontal scale Y line to temp buffer
/**
 * Scale one luma (or alpha, when isAlpha) line horizontally into dst:
 * optionally convert the input to a planar 8/16-bit line first (packed
 * formats via toYV12, planar RGB via readLumPlanar), then run either the
 * generic FIR scaler or the fast-bilinear path, and finally apply the
 * luma range conversion if one is configured.
 *
 * NOTE(review): this copy of the file is missing interior lines (the
 * opening braces and the if() conditions around the convert/range steps).
 */
2178 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2179 const uint8_t *src_in[4], int srcW, int xInc,
2180 const int16_t *hLumFilter,
2181 const int16_t *hLumFilterPos, int hLumFilterSize,
2182 uint8_t *formatConvBuffer,
2183 uint32_t *pal, int isAlpha)
2185 void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2186 void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2187 const uint8_t *src = src_in[isAlpha ? 3 : 0];
2190 toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
2191 src= formatConvBuffer;
2192 } else if (c->readLumPlanar && !isAlpha) {
2193 c->readLumPlanar(formatConvBuffer, src_in, srcW);
2194 src = formatConvBuffer;
2197 if (!c->hyscale_fast) {
2198 c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2199 } else { // fast bilinear upscale / crap downscale
2200 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
// range conversion applies to luma only; convertRange is NULL for alpha
2204 convertRange(dst, dstWidth);
/**
 * Fast-bilinear horizontal chroma scaler: same 16.16 fixed-point walk as
 * hyscale_fast_c() but interpolating the U and V planes in lockstep,
 * then rewriting the tail pixels that would read past srcW-1.
 *
 * NOTE(review): the per-iteration advance of xpos (by xInc) is missing
 * from this copy of the file — TODO confirm against upstream.
 */
2207 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2208 int dstWidth, const uint8_t *src1,
2209 const uint8_t *src2, int srcW, int xInc)
2212 unsigned int xpos=0;
2213 for (i=0;i<dstWidth;i++) {
2214 register unsigned int xx=xpos>>16;
2215 register unsigned int xalpha=(xpos&0xFFFF)>>9;
2216 dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2217 dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
// tail fixup: clamp output pixels whose source position reaches srcW-1
2220 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2221 dst1[i] = src1[srcW-1]*128;
2222 dst2[i] = src2[srcW-1]*128;
/**
 * Scale one chroma line pair (U into dst1, V into dst2) horizontally:
 * optionally unpack/convert the input into formatConvBuffer (+ a second
 * aligned buffer for the other plane) first, then run either the generic
 * FIR scaler per plane or the fast-bilinear path, and finally apply the
 * chroma range conversion if one is configured.
 *
 * NOTE(review): this copy of the file is missing interior lines (the
 * opening braces, the if() conditions, and the src2 reassignments after
 * conversion).
 */
2226 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2227 const uint8_t *src_in[4],
2228 int srcW, int xInc, const int16_t *hChrFilter,
2229 const int16_t *hChrFilterPos, int hChrFilterSize,
2230 uint8_t *formatConvBuffer, uint32_t *pal)
2232 const uint8_t *src1 = src_in[1], *src2 = src_in[2];
// second plane goes into a 16-byte-aligned buffer past the first one
2234 uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2235 c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
2236 src1= formatConvBuffer;
2238 } else if (c->readChrPlanar) {
2239 uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2240 c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
2241 src1= formatConvBuffer;
2245 if (!c->hcscale_fast) {
2246 c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2247 c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2248 } else { // fast bilinear upscale / crap downscale
2249 c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2252 if (c->chrConvertRange)
2253 c->chrConvertRange(dst1, dst2, dstWidth);
/**
 * Select the C output functions for the destination format: planar
 * writers (yuv2plane1/yuv2planeX, plus yuv2nv12cX for interleaved
 * chroma) chosen by destination bit depth, and packed writers
 * (yuv2packed1/2/X) chosen by destination pixel format, with a separate
 * table when SWS_FULL_CHR_H_INT (full horizontal chroma interpolation)
 * is enabled.
 *
 * NOTE(review): this copy of the file is missing many interior lines —
 * case labels, break statements, and the CONFIG_SMALL/#if alternatives —
 * so the switch bodies below are fragmentary.
 */
2256 static av_always_inline void
2257 find_c_packed_planar_out_funcs(SwsContext *c,
2258 yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX,
2259 yuv2interleavedX_fn *yuv2nv12cX,
2260 yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2261 yuv2packedX_fn *yuv2packedX)
2263 enum PixelFormat dstFormat = c->dstFormat;
// planar writers: pick by destination bit depth (16 / 9-10 / 8)
2265 if (is16BPS(dstFormat)) {
2266 *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c : yuv2planeX_16LE_c;
2267 *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c : yuv2plane1_16LE_c;
2268 } else if (is9_OR_10BPS(dstFormat)) {
2269 if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2270 *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c : yuv2planeX_9LE_c;
2271 *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c : yuv2plane1_9LE_c;
2273 *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c : yuv2planeX_10LE_c;
2274 *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c : yuv2plane1_10LE_c;
2277 *yuv2plane1 = yuv2plane1_8_c;
2278 *yuv2planeX = yuv2planeX_8_c;
2279 if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
2280 *yuv2nv12cX = yuv2nv12cX_c;
// packed writers, full horizontal chroma interpolation variants
2283 if(c->flags & SWS_FULL_CHR_H_INT) {
2284 switch (dstFormat) {
2287 *yuv2packedX = yuv2rgba32_full_X_c;
2289 #if CONFIG_SWSCALE_ALPHA
2291 *yuv2packedX = yuv2rgba32_full_X_c;
2293 #endif /* CONFIG_SWSCALE_ALPHA */
2295 *yuv2packedX = yuv2rgbx32_full_X_c;
2297 #endif /* !CONFIG_SMALL */
2301 *yuv2packedX = yuv2argb32_full_X_c;
2303 #if CONFIG_SWSCALE_ALPHA
2305 *yuv2packedX = yuv2argb32_full_X_c;
2307 #endif /* CONFIG_SWSCALE_ALPHA */
2309 *yuv2packedX = yuv2xrgb32_full_X_c;
2311 #endif /* !CONFIG_SMALL */
2315 *yuv2packedX = yuv2bgra32_full_X_c;
2317 #if CONFIG_SWSCALE_ALPHA
2319 *yuv2packedX = yuv2bgra32_full_X_c;
2321 #endif /* CONFIG_SWSCALE_ALPHA */
2323 *yuv2packedX = yuv2bgrx32_full_X_c;
2325 #endif /* !CONFIG_SMALL */
2329 *yuv2packedX = yuv2abgr32_full_X_c;
2331 #if CONFIG_SWSCALE_ALPHA
2333 *yuv2packedX = yuv2abgr32_full_X_c;
2335 #endif /* CONFIG_SWSCALE_ALPHA */
2337 *yuv2packedX = yuv2xbgr32_full_X_c;
2339 #endif /* !CONFIG_SMALL */
2342 *yuv2packedX = yuv2rgb24_full_X_c;
2345 *yuv2packedX = yuv2bgr24_full_X_c;
// packed writers, normal (non-full-chroma) RGB/BGR variants
2352 switch (dstFormat) {
2353 case PIX_FMT_RGB48LE:
2354 *yuv2packed1 = yuv2rgb48le_1_c;
2355 *yuv2packed2 = yuv2rgb48le_2_c;
2356 *yuv2packedX = yuv2rgb48le_X_c;
2358 case PIX_FMT_RGB48BE:
2359 *yuv2packed1 = yuv2rgb48be_1_c;
2360 *yuv2packed2 = yuv2rgb48be_2_c;
2361 *yuv2packedX = yuv2rgb48be_X_c;
2363 case PIX_FMT_BGR48LE:
2364 *yuv2packed1 = yuv2bgr48le_1_c;
2365 *yuv2packed2 = yuv2bgr48le_2_c;
2366 *yuv2packedX = yuv2bgr48le_X_c;
2368 case PIX_FMT_BGR48BE:
2369 *yuv2packed1 = yuv2bgr48be_1_c;
2370 *yuv2packed2 = yuv2bgr48be_2_c;
2371 *yuv2packedX = yuv2bgr48be_X_c;
2376 *yuv2packed1 = yuv2rgb32_1_c;
2377 *yuv2packed2 = yuv2rgb32_2_c;
2378 *yuv2packedX = yuv2rgb32_X_c;
2380 #if CONFIG_SWSCALE_ALPHA
2382 *yuv2packed1 = yuv2rgba32_1_c;
2383 *yuv2packed2 = yuv2rgba32_2_c;
2384 *yuv2packedX = yuv2rgba32_X_c;
2386 #endif /* CONFIG_SWSCALE_ALPHA */
2388 *yuv2packed1 = yuv2rgbx32_1_c;
2389 *yuv2packed2 = yuv2rgbx32_2_c;
2390 *yuv2packedX = yuv2rgbx32_X_c;
2392 #endif /* !CONFIG_SMALL */
2394 case PIX_FMT_RGB32_1:
2395 case PIX_FMT_BGR32_1:
2397 *yuv2packed1 = yuv2rgb32_1_1_c;
2398 *yuv2packed2 = yuv2rgb32_1_2_c;
2399 *yuv2packedX = yuv2rgb32_1_X_c;
2401 #if CONFIG_SWSCALE_ALPHA
2403 *yuv2packed1 = yuv2rgba32_1_1_c;
2404 *yuv2packed2 = yuv2rgba32_1_2_c;
2405 *yuv2packedX = yuv2rgba32_1_X_c;
2407 #endif /* CONFIG_SWSCALE_ALPHA */
2409 *yuv2packed1 = yuv2rgbx32_1_1_c;
2410 *yuv2packed2 = yuv2rgbx32_1_2_c;
2411 *yuv2packedX = yuv2rgbx32_1_X_c;
2413 #endif /* !CONFIG_SMALL */
2416 *yuv2packed1 = yuv2rgb24_1_c;
2417 *yuv2packed2 = yuv2rgb24_2_c;
2418 *yuv2packedX = yuv2rgb24_X_c;
2421 *yuv2packed1 = yuv2bgr24_1_c;
2422 *yuv2packed2 = yuv2bgr24_2_c;
2423 *yuv2packedX = yuv2bgr24_X_c;
2425 case PIX_FMT_RGB565LE:
2426 case PIX_FMT_RGB565BE:
2427 case PIX_FMT_BGR565LE:
2428 case PIX_FMT_BGR565BE:
2429 *yuv2packed1 = yuv2rgb16_1_c;
2430 *yuv2packed2 = yuv2rgb16_2_c;
2431 *yuv2packedX = yuv2rgb16_X_c;
2433 case PIX_FMT_RGB555LE:
2434 case PIX_FMT_RGB555BE:
2435 case PIX_FMT_BGR555LE:
2436 case PIX_FMT_BGR555BE:
2437 *yuv2packed1 = yuv2rgb15_1_c;
2438 *yuv2packed2 = yuv2rgb15_2_c;
2439 *yuv2packedX = yuv2rgb15_X_c;
2441 case PIX_FMT_RGB444LE:
2442 case PIX_FMT_RGB444BE:
2443 case PIX_FMT_BGR444LE:
2444 case PIX_FMT_BGR444BE:
2445 *yuv2packed1 = yuv2rgb12_1_c;
2446 *yuv2packed2 = yuv2rgb12_2_c;
2447 *yuv2packedX = yuv2rgb12_X_c;
2451 *yuv2packed1 = yuv2rgb8_1_c;
2452 *yuv2packed2 = yuv2rgb8_2_c;
2453 *yuv2packedX = yuv2rgb8_X_c;
2457 *yuv2packed1 = yuv2rgb4_1_c;
2458 *yuv2packed2 = yuv2rgb4_2_c;
2459 *yuv2packedX = yuv2rgb4_X_c;
2461 case PIX_FMT_RGB4_BYTE:
2462 case PIX_FMT_BGR4_BYTE:
2463 *yuv2packed1 = yuv2rgb4b_1_c;
2464 *yuv2packed2 = yuv2rgb4b_2_c;
2465 *yuv2packedX = yuv2rgb4b_X_c;
// non-RGB packed outputs: gray16, mono, packed 4:2:2 YUV
2469 switch (dstFormat) {
2470 case PIX_FMT_GRAY16BE:
2471 *yuv2packed1 = yuv2gray16BE_1_c;
2472 *yuv2packed2 = yuv2gray16BE_2_c;
2473 *yuv2packedX = yuv2gray16BE_X_c;
2475 case PIX_FMT_GRAY16LE:
2476 *yuv2packed1 = yuv2gray16LE_1_c;
2477 *yuv2packed2 = yuv2gray16LE_2_c;
2478 *yuv2packedX = yuv2gray16LE_X_c;
2480 case PIX_FMT_MONOWHITE:
2481 *yuv2packed1 = yuv2monowhite_1_c;
2482 *yuv2packed2 = yuv2monowhite_2_c;
2483 *yuv2packedX = yuv2monowhite_X_c;
2485 case PIX_FMT_MONOBLACK:
2486 *yuv2packed1 = yuv2monoblack_1_c;
2487 *yuv2packed2 = yuv2monoblack_2_c;
2488 *yuv2packedX = yuv2monoblack_X_c;
2490 case PIX_FMT_YUYV422:
2491 *yuv2packed1 = yuv2yuyv422_1_c;
2492 *yuv2packed2 = yuv2yuyv422_2_c;
2493 *yuv2packedX = yuv2yuyv422_X_c;
2495 case PIX_FMT_UYVY422:
2496 *yuv2packed1 = yuv2uyvy422_1_c;
2497 *yuv2packed2 = yuv2uyvy422_2_c;
2498 *yuv2packedX = yuv2uyvy422_X_c;
/* Buffer-tracing macro: compiled out unless DEBUG_SWSCALE_BUFFERS is set. */
2503 #define DEBUG_SWSCALE_BUFFERS 0
2504 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
/**
 * Main scaling loop: consumes the slice src[0..3]/srcStride starting at
 * srcSliceY (srcSliceH lines) and writes every destination line that can
 * be completed.  Works line-by-line: horizontally scales source lines
 * into the lum/chr ring buffers, then vertically filters the buffered
 * lines into the destination via the yuv2* output functions.
 *
 * @return the number of destination lines written (dstY - lastDstY)
 *
 * NOTE(review): this copy of the file is missing many interior lines
 * (variable declarations such as dstY/lastDstY/enough_lines, several
 * braces and increments like lumBufIndex++/lastInLumBuf++) — compare
 * against upstream before editing.
 */
2506 static int swScale(SwsContext *c, const uint8_t* src[],
2507 int srcStride[], int srcSliceY,
2508 int srcSliceH, uint8_t* dst[], int dstStride[])
2510 /* load a few things into local vars to make the code more readable and faster */
2511 const int srcW= c->srcW;
2512 const int dstW= c->dstW;
2513 const int dstH= c->dstH;
2514 const int chrDstW= c->chrDstW;
2515 const int chrSrcW= c->chrSrcW;
2516 const int lumXInc= c->lumXInc;
2517 const int chrXInc= c->chrXInc;
2518 const enum PixelFormat dstFormat= c->dstFormat;
2519 const int flags= c->flags;
2520 int16_t *vLumFilterPos= c->vLumFilterPos;
2521 int16_t *vChrFilterPos= c->vChrFilterPos;
2522 int16_t *hLumFilterPos= c->hLumFilterPos;
2523 int16_t *hChrFilterPos= c->hChrFilterPos;
2524 int16_t *hLumFilter= c->hLumFilter;
2525 int16_t *hChrFilter= c->hChrFilter;
2526 int32_t *lumMmxFilter= c->lumMmxFilter;
2527 int32_t *chrMmxFilter= c->chrMmxFilter;
2528 int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2529 const int vLumFilterSize= c->vLumFilterSize;
2530 const int vChrFilterSize= c->vChrFilterSize;
2531 const int hLumFilterSize= c->hLumFilterSize;
2532 const int hChrFilterSize= c->hChrFilterSize;
2533 int16_t **lumPixBuf= c->lumPixBuf;
2534 int16_t **chrUPixBuf= c->chrUPixBuf;
2535 int16_t **chrVPixBuf= c->chrVPixBuf;
2536 int16_t **alpPixBuf= c->alpPixBuf;
2537 const int vLumBufSize= c->vLumBufSize;
2538 const int vChrBufSize= c->vChrBufSize;
2539 uint8_t *formatConvBuffer= c->formatConvBuffer;
2540 const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2541 const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2543 uint32_t *pal=c->pal_yuv;
2544 int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2546 yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
2547 yuv2planarX_fn yuv2planeX = c->yuv2planeX;
2548 yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
2549 yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2550 yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2551 yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2553 /* vars which will change and which we need to store back in the context */
2555 int lumBufIndex= c->lumBufIndex;
2556 int chrBufIndex= c->chrBufIndex;
2557 int lastInLumBuf= c->lastInLumBuf;
2558 int lastInChrBuf= c->lastInChrBuf;
// packed input: all four plane pointers/strides alias plane 0
2560 if (isPacked(c->srcFormat)) {
2568 srcStride[3]= srcStride[0];
2570 srcStride[1]<<= c->vChrDrop;
2571 srcStride[2]<<= c->vChrDrop;
2573 DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2574 src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2575 dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2576 DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2577 srcSliceY, srcSliceH, dstY, dstH);
2578 DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2579 vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
2581 if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 || dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
2582 static int warnedAlready=0; //FIXME move this into the context perhaps
2583 if (flags & SWS_PRINT_INFO && !warnedAlready) {
2584 av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2585 " ->cannot do aligned memory accesses anymore\n");
2590 if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
2591 || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
2592 || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
2594 static int warnedAlready=0;
2595 int cpu_flags = av_get_cpu_flags();
2596 if (HAVE_MMX2 && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
2597 av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
2602 /* Note the user might start scaling the picture in the middle so this
2603 will not get executed. This is not really intended but works
2604 currently, so people might do it. */
2605 if (srcSliceY ==0) {
2613 if (!should_dither) {
2614 c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
// main loop: emit one destination line per iteration while inputs suffice
2618 for (;dstY < dstH; dstY++) {
2619 const int chrDstY= dstY>>c->chrDstVSubSample;
2620 uint8_t *dest[4] = {
2621 dst[0] + dstStride[0] * dstY,
2622 dst[1] + dstStride[1] * chrDstY,
2623 dst[2] + dstStride[2] * chrDstY,
2624 (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2626 int use_mmx_vfilter= c->use_mmx_vfilter;
2628 const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2629 const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2630 const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2631 int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2632 int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2633 int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2636 //handle holes (FAST_BILINEAR & weird filters)
2637 if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2638 if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2639 assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2640 assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2642 DEBUG_BUFFERS("dstY: %d\n", dstY);
2643 DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2644 firstLumSrcY, lastLumSrcY, lastInLumBuf);
2645 DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2646 firstChrSrcY, lastChrSrcY, lastInChrBuf);
2648 // Do we have enough lines in this slice to output the dstY line
2649 enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2651 if (!enough_lines) {
2652 lastLumSrcY = srcSliceY + srcSliceH - 1;
2653 lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2654 DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2655 lastLumSrcY, lastChrSrcY);
2658 //Do horizontal scaling
2659 while(lastInLumBuf < lastLumSrcY) {
2660 const uint8_t *src1[4] = {
2661 src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
2662 src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
2663 src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
2664 src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
2667 assert(lumBufIndex < 2*vLumBufSize);
2668 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2669 assert(lastInLumBuf + 1 - srcSliceY >= 0);
2670 hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2671 hLumFilter, hLumFilterPos, hLumFilterSize,
2674 if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2675 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src1, srcW,
2676 lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2680 DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2681 lumBufIndex, lastInLumBuf);
2683 while(lastInChrBuf < lastChrSrcY) {
2684 const uint8_t *src1[4] = {
2685 src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
2686 src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
2687 src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
2688 src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
2691 assert(chrBufIndex < 2*vChrBufSize);
2692 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2693 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2694 //FIXME replace parameters through context struct (some at least)
2696 if (c->needs_hcscale)
2697 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2698 chrDstW, src1, chrSrcW, chrXInc,
2699 hChrFilter, hChrFilterPos, hChrFilterSize,
2700 formatConvBuffer, pal);
2702 DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2703 chrBufIndex, lastInChrBuf);
2705 //wrap buf index around to stay inside the ring buffer
2706 if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2707 if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2709 break; //we can't output a dstY line so let's try with the next slice
2712 updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2714 if (should_dither) {
2715 c->chrDither8 = dither_8x8_128[chrDstY & 7];
2716 c->lumDither8 = dither_8x8_128[dstY & 7];
2718 if (dstY >= dstH-2) {
2719 // hmm looks like we can't use MMX here without overwriting this array's tail
2720 find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
2721 &yuv2packed1, &yuv2packed2, &yuv2packedX);
// vertical filtering: pointers into the ring buffers for this output line
2726 const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2727 const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2728 const int16_t **chrVSrcPtr= (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2729 const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2730 int16_t *vLumFilter= c->vLumFilter;
2731 int16_t *vChrFilter= c->vChrFilter;
2733 if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2734 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2736 vLumFilter += dstY * vLumFilterSize;
2737 vChrFilter += chrDstY * vChrFilterSize;
2739 av_assert0(use_mmx_vfilter != (
2740 yuv2planeX == yuv2planeX_10BE_c
2741 || yuv2planeX == yuv2planeX_10LE_c
2742 || yuv2planeX == yuv2planeX_9BE_c
2743 || yuv2planeX == yuv2planeX_9LE_c
2744 || yuv2planeX == yuv2planeX_16BE_c
2745 || yuv2planeX == yuv2planeX_16LE_c
2746 || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
2748 if(use_mmx_vfilter){
2749 vLumFilter= c->lumMmxFilter;
2750 vChrFilter= c->chrMmxFilter;
2753 if (vLumFilterSize == 1) {
2754 yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
2756 yuv2planeX(vLumFilter, vLumFilterSize,
2757 lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
2760 if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
2762 yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
2763 } else if (vChrFilterSize == 1) {
2764 yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
2765 yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
2767 yuv2planeX(vChrFilter, vChrFilterSize,
2768 chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
2769 yuv2planeX(vChrFilter, vChrFilterSize,
2770 chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
2774 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2775 if(use_mmx_vfilter){
2776 vLumFilter= c->alpMmxFilter;
2778 if (vLumFilterSize == 1) {
2779 yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
2781 yuv2planeX(vLumFilter, vLumFilterSize,
2782 alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
2786 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2787 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2788 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2789 int chrAlpha = vChrFilter[2 * dstY + 1];
2790 yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2791 alpPixBuf ? *alpSrcPtr : NULL,
2792 dest[0], dstW, chrAlpha, dstY);
2793 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2794 int lumAlpha = vLumFilter[2 * dstY + 1];
2795 int chrAlpha = vChrFilter[2 * dstY + 1];
2797 lumMmxFilter[3] = vLumFilter[2 * dstY ] * 0x10001;
2799 chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2800 yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2801 alpPixBuf ? alpSrcPtr : NULL,
2802 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2803 } else { //general RGB
2804 yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2805 lumSrcPtr, vLumFilterSize,
2806 vChrFilter + dstY * vChrFilterSize,
2807 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2808 alpSrcPtr, dest[0], dstW, dstY);
// YUVA output without alpha input: fill the alpha plane with opaque
2814 if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2815 fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2818 if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2819 __asm__ volatile("sfence":::"memory");
2823 /* store changed local vars back in the context */
2825 c->lumBufIndex= lumBufIndex;
2826 c->chrBufIndex= chrBufIndex;
2827 c->lastInLumBuf= lastInLumBuf;
2828 c->lastInChrBuf= lastInChrBuf;
2830 return dstY - lastDstY;
/**
 * One-time C-path initialization: selects the output functions, the
 * input unpack/conversion functions (chrToYV12 / lumToYV12 / alpToYV12 /
 * readLumPlanar / readChrPlanar), the horizontal scalers by source and
 * destination bit depth, and the range-conversion functions.
 *
 * NOTE(review): this copy of the file is missing many interior lines —
 * several case labels, if/else keywords, and closing braces — so the
 * switch bodies below are fragmentary.
 */
2833 static av_cold void sws_init_swScale_c(SwsContext *c)
2835 enum PixelFormat srcFormat = c->srcFormat;
2837 find_c_packed_planar_out_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
2838 &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,
// chroma input conversion, picked by source format
2841 c->chrToYV12 = NULL;
2843 case PIX_FMT_YUYV422 : c->chrToYV12 = yuy2ToUV_c; break;
2844 case PIX_FMT_UYVY422 : c->chrToYV12 = uyvyToUV_c; break;
2845 case PIX_FMT_NV12 : c->chrToYV12 = nv12ToUV_c; break;
2846 case PIX_FMT_NV21 : c->chrToYV12 = nv21ToUV_c; break;
2850 case PIX_FMT_BGR4_BYTE:
2851 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2852 case PIX_FMT_GBRP9LE:
2853 case PIX_FMT_GBRP10LE:
2854 case PIX_FMT_GBRP16LE: c->readChrPlanar = planar_rgb16le_to_uv; break;
2855 case PIX_FMT_GBRP9BE:
2856 case PIX_FMT_GBRP10BE:
2857 case PIX_FMT_GBRP16BE: c->readChrPlanar = planar_rgb16be_to_uv; break;
2858 case PIX_FMT_GBRP: c->readChrPlanar = planar_rgb_to_uv; break;
// >8-bit planar YUV whose endianness differs from native: byteswap
2860 case PIX_FMT_YUV444P9LE:
2861 case PIX_FMT_YUV422P9LE:
2862 case PIX_FMT_YUV420P9LE:
2863 case PIX_FMT_YUV422P10LE:
2864 case PIX_FMT_YUV420P10LE:
2865 case PIX_FMT_YUV444P10LE:
2866 case PIX_FMT_YUV420P16LE:
2867 case PIX_FMT_YUV422P16LE:
2868 case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2870 case PIX_FMT_YUV444P9BE:
2871 case PIX_FMT_YUV422P9BE:
2872 case PIX_FMT_YUV420P9BE:
2873 case PIX_FMT_YUV444P10BE:
2874 case PIX_FMT_YUV422P10BE:
2875 case PIX_FMT_YUV420P10BE:
2876 case PIX_FMT_YUV420P16BE:
2877 case PIX_FMT_YUV422P16BE:
2878 case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
// when chroma is horizontally subsampled, use the *_half_c averaging readers
2881 if (c->chrSrcHSubSample) {
2883 case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2884 case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2885 case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2886 case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2887 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV_half_c; break;
2888 case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c; break;
2889 case PIX_FMT_BGR24 : c->chrToYV12 = bgr24ToUV_half_c; break;
2890 case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2891 case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2892 case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2893 case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2894 case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_half_c; break;
2895 case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_half_c; break;
2896 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_half_c; break;
2897 case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c; break;
2898 case PIX_FMT_RGB24 : c->chrToYV12 = rgb24ToUV_half_c; break;
2899 case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2900 case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2901 case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2902 case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2903 case PIX_FMT_GBR24P : c->chrToYV12 = gbr24pToUV_half_c; break;
2904 case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_half_c; break;
2905 case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_half_c; break;
// no horizontal subsampling: one UV pair per input pixel
2909 case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2910 case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2911 case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2912 case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2913 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV_c; break;
2914 case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c; break;
2915 case PIX_FMT_BGR24 : c->chrToYV12 = bgr24ToUV_c; break;
2916 case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2917 case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2918 case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2919 case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2920 case PIX_FMT_BGR444LE: c->chrToYV12 = bgr12leToUV_c; break;
2921 case PIX_FMT_BGR444BE: c->chrToYV12 = bgr12beToUV_c; break;
2922 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_c; break;
2923 case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c; break;
2924 case PIX_FMT_RGB24 : c->chrToYV12 = rgb24ToUV_c; break;
2925 case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2926 case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2927 case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2928 case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2929 case PIX_FMT_RGB444LE: c->chrToYV12 = rgb12leToUV_c; break;
2930 case PIX_FMT_RGB444BE: c->chrToYV12 = rgb12beToUV_c; break;
// luma (and alpha) input conversion, picked by source format
2934 c->lumToYV12 = NULL;
2935 c->alpToYV12 = NULL;
2936 switch (srcFormat) {
2937 case PIX_FMT_GBRP9LE:
2938 case PIX_FMT_GBRP10LE:
2939 case PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break;
2940 case PIX_FMT_GBRP9BE:
2941 case PIX_FMT_GBRP10BE:
2942 case PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break;
2943 case PIX_FMT_GBRP: c->readLumPlanar = planar_rgb_to_y; break;
2945 case PIX_FMT_YUV444P9LE:
2946 case PIX_FMT_YUV422P9LE:
2947 case PIX_FMT_YUV420P9LE:
2948 case PIX_FMT_YUV422P10LE:
2949 case PIX_FMT_YUV420P10LE:
2950 case PIX_FMT_YUV444P10LE:
2951 case PIX_FMT_YUV420P16LE:
2952 case PIX_FMT_YUV422P16LE:
2953 case PIX_FMT_YUV444P16LE:
2954 case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2956 case PIX_FMT_YUV444P9BE:
2957 case PIX_FMT_YUV422P9BE:
2958 case PIX_FMT_YUV420P9BE:
2959 case PIX_FMT_YUV444P10BE:
2960 case PIX_FMT_YUV422P10BE:
2961 case PIX_FMT_YUV420P10BE:
2962 case PIX_FMT_YUV420P16BE:
2963 case PIX_FMT_YUV422P16BE:
2964 case PIX_FMT_YUV444P16BE:
2965 case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2967 case PIX_FMT_YUYV422 :
2968 case PIX_FMT_Y400A : c->lumToYV12 = yuy2ToY_c; break;
2969 case PIX_FMT_UYVY422 : c->lumToYV12 = uyvyToY_c; break;
2970 case PIX_FMT_BGR24 : c->lumToYV12 = bgr24ToY_c; break;
2971 case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2972 case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2973 case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2974 case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2975 case PIX_FMT_BGR444LE : c->lumToYV12 = bgr12leToY_c; break;
2976 case PIX_FMT_BGR444BE : c->lumToYV12 = bgr12beToY_c; break;
2977 case PIX_FMT_RGB24 : c->lumToYV12 = rgb24ToY_c; break;
2978 case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2979 case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2980 case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2981 case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2982 case PIX_FMT_RGB444LE : c->lumToYV12 = rgb12leToY_c; break;
2983 case PIX_FMT_RGB444BE : c->lumToYV12 = rgb12beToY_c; break;
2987 case PIX_FMT_BGR4_BYTE:
2988 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2989 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2990 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2991 case PIX_FMT_RGB32 : c->lumToYV12 = bgr32ToY_c; break;
2992 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2993 case PIX_FMT_BGR32 : c->lumToYV12 = rgb32ToY_c; break;
2994 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2995 case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2996 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2997 case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2998 case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
3001 switch (srcFormat) {
3003 case PIX_FMT_RGBA: c->alpToYV12 = rgbaToA_c; break;
3005 case PIX_FMT_ARGB: c->alpToYV12 = abgrToA_c; break;
3006 case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
3007 case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
// horizontal scalers: chosen by source bpc and destination bpc
3012 if (c->srcBpc == 8) {
3013 if (c->dstBpc <= 10) {
3014 c->hyScale = c->hcScale = hScale8To15_c;
3015 if (c->flags & SWS_FAST_BILINEAR) {
3016 c->hyscale_fast = hyscale_fast_c;
3017 c->hcscale_fast = hcscale_fast_c;
3020 c->hyScale = c->hcScale = hScale8To19_c;
3023 c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
// range conversion only for YUV output when src/dst ranges differ
3026 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3027 if (c->dstBpc <= 10) {
3029 c->lumConvertRange = lumRangeFromJpeg_c;
3030 c->chrConvertRange = chrRangeFromJpeg_c;
3032 c->lumConvertRange = lumRangeToJpeg_c;
3033 c->chrConvertRange = chrRangeToJpeg_c;
3037 c->lumConvertRange = lumRangeFromJpeg16_c;
3038 c->chrConvertRange = chrRangeFromJpeg16_c;
3040 c->lumConvertRange = lumRangeToJpeg16_c;
3041 c->chrConvertRange = chrRangeToJpeg16_c;
3046 if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3047 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3048 c->needs_hcscale = 1;
3051 SwsFunc ff_getSwsFunc(SwsContext *c)
3053 sws_init_swScale_c(c);
3056 ff_sws_init_swScale_mmx(c);
3058 ff_sws_init_swScale_altivec(c);