git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/avassert.h"
  64 #include "libavutil/intreadwrite.h"
  65 #include "libavutil/cpu.h"
  66 #include "libavutil/avutil.h"
  67 #include "libavutil/mathematics.h"
  68 #include "libavutil/bswap.h"
  69 #include "libavutil/pixdesc.h"
  70
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 186 {  36, 68, 60, 92, 34, 66, 58, 90,},
 187 { 100,  4,124, 28, 98,  2,122, 26,},
 188 {  52, 84, 44, 76, 50, 82, 42, 74,},
 189 { 116, 20,108, 12,114, 18,106, 10,},
 190 {  32, 64, 56, 88, 38, 70, 62, 94,},
 191 {  96,  0,120, 24,102,  6,126, 30,},
 192 {  48, 80, 40, 72, 54, 86, 46, 78,},
 193 { 112, 16,104,  8,118, 22,110, 14,},
 194 };
 195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 196 {  64, 64, 64, 64, 64, 64, 64, 64 };
 197
 198 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 199 {
 200   {   0,  1,  0,  1,  0,  1,  0,  1,},
 201   {   1,  0,  1,  0,  1,  0,  1,  0,},
 202   {   0,  1,  0,  1,  0,  1,  0,  1,},
 203   {   1,  0,  1,  0,  1,  0,  1,  0,},
 204   {   0,  1,  0,  1,  0,  1,  0,  1,},
 205   {   1,  0,  1,  0,  1,  0,  1,  0,},
 206   {   0,  1,  0,  1,  0,  1,  0,  1,},
 207   {   1,  0,  1,  0,  1,  0,  1,  0,},
 208 },{
 209   {   1,  2,  1,  2,  1,  2,  1,  2,},
 210   {   3,  0,  3,  0,  3,  0,  3,  0,},
 211   {   1,  2,  1,  2,  1,  2,  1,  2,},
 212   {   3,  0,  3,  0,  3,  0,  3,  0,},
 213   {   1,  2,  1,  2,  1,  2,  1,  2,},
 214   {   3,  0,  3,  0,  3,  0,  3,  0,},
 215   {   1,  2,  1,  2,  1,  2,  1,  2,},
 216   {   3,  0,  3,  0,  3,  0,  3,  0,},
 217 },{
 218   {   2,  4,  3,  5,  2,  4,  3,  5,},
 219   {   6,  0,  7,  1,  6,  0,  7,  1,},
 220   {   3,  5,  2,  4,  3,  5,  2,  4,},
 221   {   7,  1,  6,  0,  7,  1,  6,  0,},
 222   {   2,  4,  3,  5,  2,  4,  3,  5,},
 223   {   6,  0,  7,  1,  6,  0,  7,  1,},
 224   {   3,  5,  2,  4,  3,  5,  2,  4,},
 225   {   7,  1,  6,  0,  7,  1,  6,  0,},
 226 },{
 227   {   4,  8,  7, 11,  4,  8,  7, 11,},
 228   {  12,  0, 15,  3, 12,  0, 15,  3,},
 229   {   6, 10,  5,  9,  6, 10,  5,  9,},
 230   {  14,  2, 13,  1, 14,  2, 13,  1,},
 231   {   4,  8,  7, 11,  4,  8,  7, 11,},
 232   {  12,  0, 15,  3, 12,  0, 15,  3,},
 233   {   6, 10,  5,  9,  6, 10,  5,  9,},
 234   {  14,  2, 13,  1, 14,  2, 13,  1,},
 235 },{
 236   {   9, 17, 15, 23,  8, 16, 14, 22,},
 237   {  25,  1, 31,  7, 24,  0, 30,  6,},
 238   {  13, 21, 11, 19, 12, 20, 10, 18,},
 239   {  29,  5, 27,  3, 28,  4, 26,  2,},
 240   {   8, 16, 14, 22,  9, 17, 15, 23,},
 241   {  24,  0, 30,  6, 25,  1, 31,  7,},
 242   {  12, 20, 10, 18, 13, 21, 11, 19,},
 243   {  28,  4, 26,  2, 29,  5, 27,  3,},
 244 },{
 245   {  18, 34, 30, 46, 17, 33, 29, 45,},
 246   {  50,  2, 62, 14, 49,  1, 61, 13,},
 247   {  26, 42, 22, 38, 25, 41, 21, 37,},
 248   {  58, 10, 54,  6, 57,  9, 53,  5,},
 249   {  16, 32, 28, 44, 19, 35, 31, 47,},
 250   {  48,  0, 60, 12, 51,  3, 63, 15,},
 251   {  24, 40, 20, 36, 27, 43, 23, 39,},
 252   {  56,  8, 52,  4, 59, 11, 55,  7,},
 253 },{
 254   {  18, 34, 30, 46, 17, 33, 29, 45,},
 255   {  50,  2, 62, 14, 49,  1, 61, 13,},
 256   {  26, 42, 22, 38, 25, 41, 21, 37,},
 257   {  58, 10, 54,  6, 57,  9, 53,  5,},
 258   {  16, 32, 28, 44, 19, 35, 31, 47,},
 259   {  48,  0, 60, 12, 51,  3, 63, 15,},
 260   {  24, 40, 20, 36, 27, 43, 23, 39,},
 261   {  56,  8, 52,  4, 59, 11, 55,  7,},
 262 },{
 263   {  36, 68, 60, 92, 34, 66, 58, 90,},
 264   { 100,  4,124, 28, 98,  2,122, 26,},
 265   {  52, 84, 44, 76, 50, 82, 42, 74,},
 266   { 116, 20,108, 12,114, 18,106, 10,},
 267   {  32, 64, 56, 88, 38, 70, 62, 94,},
 268   {  96,  0,120, 24,102,  6,126, 30,},
 269   {  48, 80, 40, 72, 54, 86, 46, 78,},
 270   { 112, 16,104,  8,118, 22,110, 14,},
 271 }};
 272
 273 static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 274
 275 const uint16_t dither_scale[15][16]={
 276 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 277 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
 278 {    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
 279 {    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
 280 {    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
 281 {    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
 282 {    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
 283 {    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
 284 {    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
 285 {    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
 286 {    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
 287 {    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
 288 {    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
 289 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 290 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 291 };
 292
 293 #define output_pixel(pos, val, bias, signedness) \
 294     if (big_endian) { \
 295         AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 296     } else { \
 297         AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 298     }
 299
 300 static av_always_inline void
 301 yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
 302                          int big_endian, int output_bits)
 303 {
 304     int i;
 305     int shift = 3;
 306     av_assert0(output_bits == 16);
 307
 308     for (i = 0; i < dstW; i++) {
 309         int val = src[i] + (1 << (shift - 1));
 310         output_pixel(&dest[i], val, 0, uint);
 311     }
 312 }
 313
 314 static av_always_inline void
 315 yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
 316                          const int32_t **src, uint16_t *dest, int dstW,
 317                          int big_endian, int output_bits)
 318 {
 319     int i;
 320     int shift = 15;
 321     av_assert0(output_bits == 16);
 322
 323     for (i = 0; i < dstW; i++) {
 324         int val = 1 << (shift - 1);
 325         int j;
 326
 327         /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
 328          * filters (or anything with negative coeffs, the range can be slightly
 329          * wider in both directions. To account for this overflow, we subtract
 330          * a constant so it always fits in the signed range (assuming a
 331          * reasonable filterSize), and re-add that at the end. */
 332         val -= 0x40000000;
 333         for (j = 0; j < filterSize; j++)
 334             val += src[j][i] * filter[j];
 335
 336         output_pixel(&dest[i], val, 0x8000, int);
 337     }
 338 }
 339
 340 #undef output_pixel
 341
 342 #define output_pixel(pos, val) \
 343     if (big_endian) { \
 344         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 345     } else { \
 346         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 347     }
 348
 349 static av_always_inline void
 350 yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
 351                          int big_endian, int output_bits)
 352 {
 353     int i;
 354     int shift = 15 - output_bits;
 355
 356     for (i = 0; i < dstW; i++) {
 357         int val = src[i] + (1 << (shift - 1));
 358         output_pixel(&dest[i], val);
 359     }
 360 }
 361
 362 static av_always_inline void
 363 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
 364                          const int16_t **src, uint16_t *dest, int dstW,
 365                          int big_endian, int output_bits)
 366 {
 367     int i;
 368     int shift = 11 + 16 - output_bits;
 369
 370     for (i = 0; i < dstW; i++) {
 371         int val = 1 << (shift - 1);
 372         int j;
 373
 374         for (j = 0; j < filterSize; j++)
 375             val += src[j][i] * filter[j];
 376
 377         output_pixel(&dest[i], val);
 378     }
 379 }
 380
 381 #undef output_pixel
 382
 383 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
 384 static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
 385                               uint8_t *dest, int dstW, \
 386                               const uint8_t *dither, int offset)\
 387 { \
 388     yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
 389                          (uint16_t *) dest, dstW, is_be, bits); \
 390 }\
 391 static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
 392                               const int16_t **src, uint8_t *dest, int dstW, \
 393                               const uint8_t *dither, int offset)\
 394 { \
 395     yuv2planeX_## template_size ## _c_template(filter, \
 396                          filterSize, (const typeX_t **) src, \
 397                          (uint16_t *) dest, dstW, is_be, bits); \
 398 }
 399 yuv2NBPS( 9, BE, 1, 10, int16_t);
 400 yuv2NBPS( 9, LE, 0, 10, int16_t);
 401 yuv2NBPS(10, BE, 1, 10, int16_t);
 402 yuv2NBPS(10, LE, 0, 10, int16_t);
 403 yuv2NBPS(16, BE, 1, 16, int32_t);
 404 yuv2NBPS(16, LE, 0, 16, int32_t);
 405
 406 static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
 407                            const int16_t **src, uint8_t *dest, int dstW,
 408                            const uint8_t *dither, int offset)
 409 {
 410     int i;
 411     for (i=0; i<dstW; i++) {
 412         int val = dither[(i + offset) & 7] << 12;
 413         int j;
 414         for (j=0; j<filterSize; j++)
 415             val += src[j][i] * filter[j];
 416
 417         dest[i]= av_clip_uint8(val>>19);
 418     }
 419 }
 420
 421 static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
 422                            const uint8_t *dither, int offset)
 423 {
 424     int i;
 425     for (i=0; i<dstW; i++) {
 426         int val = (src[i] + dither[(i + offset) & 7]) >> 7;
 427         dest[i]= av_clip_uint8(val);
 428     }
 429 }
 430
 431 static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
 432                         const int16_t **chrUSrc, const int16_t **chrVSrc,
 433                         uint8_t *dest, int chrDstW)
 434 {
 435     enum PixelFormat dstFormat = c->dstFormat;
 436     const uint8_t *chrDither = c->chrDither8;
 437     int i;
 438
 439     if (dstFormat == PIX_FMT_NV12)
 440         for (i=0; i<chrDstW; i++) {
 441             int u = chrDither[i & 7] << 12;
 442             int v = chrDither[(i + 3) & 7] << 12;
 443             int j;
 444             for (j=0; j<chrFilterSize; j++) {
 445                 u += chrUSrc[j][i] * chrFilter[j];
 446                 v += chrVSrc[j][i] * chrFilter[j];
 447             }
 448
 449             dest[2*i]= av_clip_uint8(u>>19);
 450             dest[2*i+1]= av_clip_uint8(v>>19);
 451         }
 452     else
 453         for (i=0; i<chrDstW; i++) {
 454             int u = chrDither[i & 7] << 12;
 455             int v = chrDither[(i + 3) & 7] << 12;
 456             int j;
 457             for (j=0; j<chrFilterSize; j++) {
 458                 u += chrUSrc[j][i] * chrFilter[j];
 459                 v += chrVSrc[j][i] * chrFilter[j];
 460             }
 461
 462             dest[2*i]= av_clip_uint8(v>>19);
 463             dest[2*i+1]= av_clip_uint8(u>>19);
 464         }
 465 }
 466
 467 #define output_pixel(pos, val) \
 468         if (target == PIX_FMT_GRAY16BE) { \
 469             AV_WB16(pos, val); \
 470         } else { \
 471             AV_WL16(pos, val); \
 472         }
 473
 474 static av_always_inline void
 475 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 476                         const int32_t **lumSrc, int lumFilterSize,
 477                         const int16_t *chrFilter, const int32_t **chrUSrc,
 478                         const int32_t **chrVSrc, int chrFilterSize,
 479                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 480                         int y, enum PixelFormat target)
 481 {
 482     int i;
 483
 484     for (i = 0; i < (dstW >> 1); i++) {
 485         int j;
 486         int Y1 = 1 << 14;
 487         int Y2 = 1 << 14;
 488
 489         for (j = 0; j < lumFilterSize; j++) {
 490             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 491             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 492         }
 493         Y1 >>= 15;
 494         Y2 >>= 15;
 495         if ((Y1 | Y2) & 0x10000) {
 496             Y1 = av_clip_uint16(Y1);
 497             Y2 = av_clip_uint16(Y2);
 498         }
 499         output_pixel(&dest[i * 2 + 0], Y1);
 500         output_pixel(&dest[i * 2 + 1], Y2);
 501     }
 502 }
 503
 504 static av_always_inline void
 505 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 506                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 507                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 508                         int yalpha, int uvalpha, int y,
 509                         enum PixelFormat target)
 510 {
 511     int  yalpha1 = 4095 - yalpha;
 512     int i;
 513     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 514
 515     for (i = 0; i < (dstW >> 1); i++) {
 516         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 517         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 518
 519         output_pixel(&dest[i * 2 + 0], Y1);
 520         output_pixel(&dest[i * 2 + 1], Y2);
 521     }
 522 }
 523
 524 static av_always_inline void
 525 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 526                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 527                         const int32_t *abuf0, uint16_t *dest, int dstW,
 528                         int uvalpha, int y, enum PixelFormat target)
 529 {
 530     int i;
 531
 532     for (i = 0; i < (dstW >> 1); i++) {
 533         int Y1 = (buf0[i * 2    ]+4)>>3;
 534         int Y2 = (buf0[i * 2 + 1]+4)>>3;
 535
 536         output_pixel(&dest[i * 2 + 0], Y1);
 537         output_pixel(&dest[i * 2 + 1], Y2);
 538     }
 539 }
 540
 541 #undef output_pixel
 542
 543 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 544 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 545                         const int16_t **_lumSrc, int lumFilterSize, \
 546                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 547                         const int16_t **_chrVSrc, int chrFilterSize, \
 548                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 549                         int y) \
 550 { \
 551     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 552                   **chrUSrc = (const int32_t **) _chrUSrc, \
 553                   **chrVSrc = (const int32_t **) _chrVSrc, \
 554                   **alpSrc  = (const int32_t **) _alpSrc; \
 555     uint16_t *dest = (uint16_t *) _dest; \
 556     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 557                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 558                           alpSrc, dest, dstW, y, fmt); \
 559 } \
 560  \
 561 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 562                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 563                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 564                         int yalpha, int uvalpha, int y) \
 565 { \
 566     const int32_t **buf  = (const int32_t **) _buf, \
 567                   **ubuf = (const int32_t **) _ubuf, \
 568                   **vbuf = (const int32_t **) _vbuf, \
 569                   **abuf = (const int32_t **) _abuf; \
 570     uint16_t *dest = (uint16_t *) _dest; \
 571     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 572                           dest, dstW, yalpha, uvalpha, y, fmt); \
 573 } \
 574  \
 575 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 576                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 577                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 578                         int uvalpha, int y) \
 579 { \
 580     const int32_t *buf0  = (const int32_t *)  _buf0, \
 581                  **ubuf  = (const int32_t **) _ubuf, \
 582                  **vbuf  = (const int32_t **) _vbuf, \
 583                   *abuf0 = (const int32_t *)  _abuf0; \
 584     uint16_t *dest = (uint16_t *) _dest; \
 585     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 586                                   dstW, uvalpha, y, fmt); \
 587 }
 588
 589 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 590 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 591
 592 #define output_pixel(pos, acc) \
 593     if (target == PIX_FMT_MONOBLACK) { \
 594         pos = acc; \
 595     } else { \
 596         pos = ~acc; \
 597     }
 598
 599 static av_always_inline void
 600 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 601                       const int16_t **lumSrc, int lumFilterSize,
 602                       const int16_t *chrFilter, const int16_t **chrUSrc,
 603                       const int16_t **chrVSrc, int chrFilterSize,
 604                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 605                       int y, enum PixelFormat target)
 606 {
 607     const uint8_t * const d128=dither_8x8_220[y&7];
 608     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 609     int i;
 610     unsigned acc = 0;
 611
 612     for (i = 0; i < dstW - 1; i += 2) {
 613         int j;
 614         int Y1 = 1 << 18;
 615         int Y2 = 1 << 18;
 616
 617         for (j = 0; j < lumFilterSize; j++) {
 618             Y1 += lumSrc[j][i]   * lumFilter[j];
 619             Y2 += lumSrc[j][i+1] * lumFilter[j];
 620         }
 621         Y1 >>= 19;
 622         Y2 >>= 19;
 623         if ((Y1 | Y2) & 0x100) {
 624             Y1 = av_clip_uint8(Y1);
 625             Y2 = av_clip_uint8(Y2);
 626         }
 627         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 628         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 629         if ((i & 7) == 6) {
 630             output_pixel(*dest++, acc);
 631         }
 632     }
 633 }
 634
 635 static av_always_inline void
 636 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 637                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 638                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 639                       int yalpha, int uvalpha, int y,
 640                       enum PixelFormat target)
 641 {
 642     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 643     const uint8_t * const d128 = dither_8x8_220[y & 7];
 644     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 645     int  yalpha1 = 4095 - yalpha;
 646     int i;
 647
 648     for (i = 0; i < dstW - 7; i += 8) {
 649         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 650         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 651         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 652         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 653         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 654         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 655         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 656         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 657         output_pixel(*dest++, acc);
 658     }
 659 }
 660
 661 static av_always_inline void
 662 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 663                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 664                       const int16_t *abuf0, uint8_t *dest, int dstW,
 665                       int uvalpha, int y, enum PixelFormat target)
 666 {
 667     const uint8_t * const d128 = dither_8x8_220[y & 7];
 668     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 669     int i;
 670
 671     for (i = 0; i < dstW - 7; i += 8) {
 672         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 673         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 674         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 675         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 676         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 677         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 678         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 679         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 680         output_pixel(*dest++, acc);
 681     }
 682 }
 683
 684 #undef output_pixel
 685
 686 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 687 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 688                                 const int16_t **lumSrc, int lumFilterSize, \
 689                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 690                                 const int16_t **chrVSrc, int chrFilterSize, \
 691                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 692                                 int y) \
 693 { \
 694     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 695                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 696                                   alpSrc, dest, dstW, y, fmt); \
 697 } \
 698  \
 699 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 700                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 701                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 702                                 int yalpha, int uvalpha, int y) \
 703 { \
 704     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 705                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 706 } \
 707  \
 708 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 709                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 710                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 711                                 int uvalpha, int y) \
 712 { \
 713     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 714                                   abuf0, dest, dstW, uvalpha, \
 715                                   y, fmt); \
 716 }
 717
 718 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 719 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 720
 721 #define output_pixels(pos, Y1, U, Y2, V) \
 722     if (target == PIX_FMT_YUYV422) { \
 723         dest[pos + 0] = Y1; \
 724         dest[pos + 1] = U;  \
 725         dest[pos + 2] = Y2; \
 726         dest[pos + 3] = V;  \
 727     } else { \
 728         dest[pos + 0] = U;  \
 729         dest[pos + 1] = Y1; \
 730         dest[pos + 2] = V;  \
 731         dest[pos + 3] = Y2; \
 732     }
 733
 734 static av_always_inline void
 735 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 736                      const int16_t **lumSrc, int lumFilterSize,
 737                      const int16_t *chrFilter, const int16_t **chrUSrc,
 738                      const int16_t **chrVSrc, int chrFilterSize,
 739                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 740                      int y, enum PixelFormat target)
 741 {
 742     int i;
 743
 744     for (i = 0; i < (dstW >> 1); i++) {
 745         int j;
 746         int Y1 = 1 << 18;
 747         int Y2 = 1 << 18;
 748         int U  = 1 << 18;
 749         int V  = 1 << 18;
 750
 751         for (j = 0; j < lumFilterSize; j++) {
 752             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 753             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 754         }
 755         for (j = 0; j < chrFilterSize; j++) {
 756             U += chrUSrc[j][i] * chrFilter[j];
 757             V += chrVSrc[j][i] * chrFilter[j];
 758         }
 759         Y1 >>= 19;
 760         Y2 >>= 19;
 761         U  >>= 19;
 762         V  >>= 19;
 763         if ((Y1 | Y2 | U | V) & 0x100) {
 764             Y1 = av_clip_uint8(Y1);
 765             Y2 = av_clip_uint8(Y2);
 766             U  = av_clip_uint8(U);
 767             V  = av_clip_uint8(V);
 768         }
 769         output_pixels(4*i, Y1, U, Y2, V);
 770     }
 771 }
 772
 773 static av_always_inline void
 774 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 775                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 776                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 777                      int yalpha, int uvalpha, int y,
 778                      enum PixelFormat target)
 779 {
 780     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 781                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 782                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 783     int  yalpha1 = 4095 - yalpha;
 784     int uvalpha1 = 4095 - uvalpha;
 785     int i;
 786
 787     for (i = 0; i < (dstW >> 1); i++) {
 788         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 789         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 790         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 791         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 792
 793         output_pixels(i * 4, Y1, U, Y2, V);
 794     }
 795 }
 796
 797 static av_always_inline void
 798 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 799                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 800                      const int16_t *abuf0, uint8_t *dest, int dstW,
 801                      int uvalpha, int y, enum PixelFormat target)
 802 {
 803     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 804                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 805     int i;
 806
 807     if (uvalpha < 2048) {
 808         for (i = 0; i < (dstW >> 1); i++) {
 809             int Y1 = buf0[i * 2]     >> 7;
 810             int Y2 = buf0[i * 2 + 1] >> 7;
 811             int U  = ubuf1[i]        >> 7;
 812             int V  = vbuf1[i]        >> 7;
 813
 814             output_pixels(i * 4, Y1, U, Y2, V);
 815         }
 816     } else {
 817         for (i = 0; i < (dstW >> 1); i++) {
 818             int Y1 =  buf0[i * 2]          >> 7;
 819             int Y2 =  buf0[i * 2 + 1]      >> 7;
 820             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 821             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 822
 823             output_pixels(i * 4, Y1, U, Y2, V);
 824         }
 825     }
 826 }
 827
 828 #undef output_pixels
 829
 830 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
 831 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 832
 833 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 834 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 835 #define output_pixel(pos, val) \
 836     if (isBE(target)) { \
 837         AV_WB16(pos, val); \
 838     } else { \
 839         AV_WL16(pos, val); \
 840     }
 841
 842 static av_always_inline void
 843 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 844                        const int32_t **lumSrc, int lumFilterSize,
 845                        const int16_t *chrFilter, const int32_t **chrUSrc,
 846                        const int32_t **chrVSrc, int chrFilterSize,
 847                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 848                        int y, enum PixelFormat target)
 849 {
 850     int i;
 851
 852     for (i = 0; i < (dstW >> 1); i++) {
 853         int j;
 854         int Y1 = 0;
 855         int Y2 = 0;
 856         int U  = -128 << 23; // 19
 857         int V  = -128 << 23;
 858         int R, G, B;
 859
 860         for (j = 0; j < lumFilterSize; j++) {
 861             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 862             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 863         }
 864         for (j = 0; j < chrFilterSize; j++) {
 865             U += chrUSrc[j][i] * chrFilter[j];
 866             V += chrVSrc[j][i] * chrFilter[j];
 867         }
 868
 869         // 8bit: 12+15=27; 16-bit: 12+19=31
 870         Y1 >>= 14; // 10
 871         Y2 >>= 14;
 872         U  >>= 14;
 873         V  >>= 14;
 874
 875         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 876         Y1 -= c->yuv2rgb_y_offset;
 877         Y2 -= c->yuv2rgb_y_offset;
 878         Y1 *= c->yuv2rgb_y_coeff;
 879         Y2 *= c->yuv2rgb_y_coeff;
 880         Y1 += 1 << 13; // 21
 881         Y2 += 1 << 13;
 882         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 883
 884         R = V * c->yuv2rgb_v2r_coeff;
 885         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 886         B =                            U * c->yuv2rgb_u2b_coeff;
 887
 888         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 889         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 890         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 891         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 892         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 893         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 894         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 895         dest += 6;
 896     }
 897 }
 898
 899 static av_always_inline void
 900 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 901                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 902                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 903                        int yalpha, int uvalpha, int y,
 904                        enum PixelFormat target)
 905 {
 906     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 907                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 908                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 909     int  yalpha1 = 4095 - yalpha;
 910     int uvalpha1 = 4095 - uvalpha;
 911     int i;
 912
 913     for (i = 0; i < (dstW >> 1); i++) {
 914         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 915         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 916         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 917         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 918         int R, G, B;
 919
 920         Y1 -= c->yuv2rgb_y_offset;
 921         Y2 -= c->yuv2rgb_y_offset;
 922         Y1 *= c->yuv2rgb_y_coeff;
 923         Y2 *= c->yuv2rgb_y_coeff;
 924         Y1 += 1 << 13;
 925         Y2 += 1 << 13;
 926
 927         R = V * c->yuv2rgb_v2r_coeff;
 928         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 929         B =                            U * c->yuv2rgb_u2b_coeff;
 930
 931         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 932         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 933         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 934         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 935         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 936         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 937         dest += 6;
 938     }
 939 }
 940
 941 static av_always_inline void
 942 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 943                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 944                        const int32_t *abuf0, uint16_t *dest, int dstW,
 945                        int uvalpha, int y, enum PixelFormat target)
 946 {
 947     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 948                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 949     int i;
 950
 951     if (uvalpha < 2048) {
 952         for (i = 0; i < (dstW >> 1); i++) {
 953             int Y1 = (buf0[i * 2]    ) >> 2;
 954             int Y2 = (buf0[i * 2 + 1]) >> 2;
 955             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 956             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 957             int R, G, B;
 958
 959             Y1 -= c->yuv2rgb_y_offset;
 960             Y2 -= c->yuv2rgb_y_offset;
 961             Y1 *= c->yuv2rgb_y_coeff;
 962             Y2 *= c->yuv2rgb_y_coeff;
 963             Y1 += 1 << 13;
 964             Y2 += 1 << 13;
 965
 966             R = V * c->yuv2rgb_v2r_coeff;
 967             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 968             B =                            U * c->yuv2rgb_u2b_coeff;
 969
 970             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 971             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 972             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 973             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 974             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 975             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 976             dest += 6;
 977         }
 978     } else {
 979         for (i = 0; i < (dstW >> 1); i++) {
 980             int Y1 = (buf0[i * 2]    ) >> 2;
 981             int Y2 = (buf0[i * 2 + 1]) >> 2;
 982             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
 983             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
 984             int R, G, B;
 985
 986             Y1 -= c->yuv2rgb_y_offset;
 987             Y2 -= c->yuv2rgb_y_offset;
 988             Y1 *= c->yuv2rgb_y_coeff;
 989             Y2 *= c->yuv2rgb_y_coeff;
 990             Y1 += 1 << 13;
 991             Y2 += 1 << 13;
 992
 993             R = V * c->yuv2rgb_v2r_coeff;
 994             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 995             B =                            U * c->yuv2rgb_u2b_coeff;
 996
 997             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 998             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 999             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1000             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1001             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1002             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1003             dest += 6;
1004         }
1005     }
1006 }
1007
/* Retire the helper macros of the yuv2rgb48 templates. The component
 * selectors are defined in upper case (R_B / B_R), so they have to be
 * undefined under exactly those names; undefining r_b / b_r is a no-op
 * that leaves both macros visible to the rest of the file. */
#undef output_pixel
#undef R_B
#undef B_R
1011
/* 48-bit output: instantiate the 16-bit packed wrappers around the
 * yuv2rgb48 templates for both component orders and both byte orders. */
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
1016
1017 static av_always_inline void
1018 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
1019               int U, int V, int A1, int A2,
1020               const void *_r, const void *_g, const void *_b, int y,
1021               enum PixelFormat target, int hasAlpha)
1022 {
1023     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1024         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1025         uint32_t *dest = (uint32_t *) _dest;
1026         const uint32_t *r = (const uint32_t *) _r;
1027         const uint32_t *g = (const uint32_t *) _g;
1028         const uint32_t *b = (const uint32_t *) _b;
1029
1030 #if CONFIG_SMALL
1031         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1032
1033         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1034         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1035 #else
1036         if (hasAlpha) {
1037             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1038
1039             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1040             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1041         } else {
1042             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1043             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1044         }
1045 #endif
1046     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1047         uint8_t *dest = (uint8_t *) _dest;
1048         const uint8_t *r = (const uint8_t *) _r;
1049         const uint8_t *g = (const uint8_t *) _g;
1050         const uint8_t *b = (const uint8_t *) _b;
1051
1052 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1053 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1054
1055         dest[i * 6 + 0] = r_b[Y1];
1056         dest[i * 6 + 1] =   g[Y1];
1057         dest[i * 6 + 2] = b_r[Y1];
1058         dest[i * 6 + 3] = r_b[Y2];
1059         dest[i * 6 + 4] =   g[Y2];
1060         dest[i * 6 + 5] = b_r[Y2];
1061 #undef r_b
1062 #undef b_r
1063     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1064                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1065                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1066         uint16_t *dest = (uint16_t *) _dest;
1067         const uint16_t *r = (const uint16_t *) _r;
1068         const uint16_t *g = (const uint16_t *) _g;
1069         const uint16_t *b = (const uint16_t *) _b;
1070         int dr1, dg1, db1, dr2, dg2, db2;
1071
1072         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1073             dr1 = dither_2x2_8[ y & 1     ][0];
1074             dg1 = dither_2x2_4[ y & 1     ][0];
1075             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1076             dr2 = dither_2x2_8[ y & 1     ][1];
1077             dg2 = dither_2x2_4[ y & 1     ][1];
1078             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1079         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1080             dr1 = dither_2x2_8[ y & 1     ][0];
1081             dg1 = dither_2x2_8[ y & 1     ][1];
1082             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1083             dr2 = dither_2x2_8[ y & 1     ][1];
1084             dg2 = dither_2x2_8[ y & 1     ][0];
1085             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1086         } else {
1087             dr1 = dither_4x4_16[ y & 3     ][0];
1088             dg1 = dither_4x4_16[ y & 3     ][1];
1089             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1090             dr2 = dither_4x4_16[ y & 3     ][1];
1091             dg2 = dither_4x4_16[ y & 3     ][0];
1092             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1093         }
1094
1095         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1096         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1097     } else /* 8/4-bit */ {
1098         uint8_t *dest = (uint8_t *) _dest;
1099         const uint8_t *r = (const uint8_t *) _r;
1100         const uint8_t *g = (const uint8_t *) _g;
1101         const uint8_t *b = (const uint8_t *) _b;
1102         int dr1, dg1, db1, dr2, dg2, db2;
1103
1104         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1105             const uint8_t * const d64 = dither_8x8_73[y & 7];
1106             const uint8_t * const d32 = dither_8x8_32[y & 7];
1107             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1108             db1 =       d64[(i * 2 + 0) & 7];
1109             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1110             db2 =       d64[(i * 2 + 1) & 7];
1111         } else {
1112             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1113             const uint8_t * const d128 = dither_8x8_220[y & 7];
1114             dr1 = db1 = d128[(i * 2 + 0) & 7];
1115             dg1 =        d64[(i * 2 + 0) & 7];
1116             dr2 = db2 = d128[(i * 2 + 1) & 7];
1117             dg2 =        d64[(i * 2 + 1) & 7];
1118         }
1119
1120         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1121             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1122                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1123         } else {
1124             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1125             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1126         }
1127     }
1128 }
1129
1130 static av_always_inline void
1131 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1132                      const int16_t **lumSrc, int lumFilterSize,
1133                      const int16_t *chrFilter, const int16_t **chrUSrc,
1134                      const int16_t **chrVSrc, int chrFilterSize,
1135                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1136                      int y, enum PixelFormat target, int hasAlpha)
1137 {
1138     int i;
1139
1140     for (i = 0; i < (dstW >> 1); i++) {
1141         int j;
1142         int Y1 = 1 << 18;
1143         int Y2 = 1 << 18;
1144         int U  = 1 << 18;
1145         int V  = 1 << 18;
1146         int av_unused A1, A2;
1147         const void *r, *g, *b;
1148
1149         for (j = 0; j < lumFilterSize; j++) {
1150             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1151             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1152         }
1153         for (j = 0; j < chrFilterSize; j++) {
1154             U += chrUSrc[j][i] * chrFilter[j];
1155             V += chrVSrc[j][i] * chrFilter[j];
1156         }
1157         Y1 >>= 19;
1158         Y2 >>= 19;
1159         U  >>= 19;
1160         V  >>= 19;
1161         if ((Y1 | Y2 | U | V) & 0x100) {
1162             Y1 = av_clip_uint8(Y1);
1163             Y2 = av_clip_uint8(Y2);
1164             U  = av_clip_uint8(U);
1165             V  = av_clip_uint8(V);
1166         }
1167         if (hasAlpha) {
1168             A1 = 1 << 18;
1169             A2 = 1 << 18;
1170             for (j = 0; j < lumFilterSize; j++) {
1171                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1172                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1173             }
1174             A1 >>= 19;
1175             A2 >>= 19;
1176             if ((A1 | A2) & 0x100) {
1177                 A1 = av_clip_uint8(A1);
1178                 A2 = av_clip_uint8(A2);
1179             }
1180         }
1181
1182         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1183         r =  c->table_rV[V];
1184         g = (c->table_gU[U] + c->table_gV[V]);
1185         b =  c->table_bU[U];
1186
1187         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1188                       r, g, b, y, target, hasAlpha);
1189     }
1190 }
1191
1192 static av_always_inline void
1193 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1194                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1195                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1196                      int yalpha, int uvalpha, int y,
1197                      enum PixelFormat target, int hasAlpha)
1198 {
1199     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1200                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1201                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1202                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1203                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1204     int  yalpha1 = 4095 - yalpha;
1205     int uvalpha1 = 4095 - uvalpha;
1206     int i;
1207
1208     for (i = 0; i < (dstW >> 1); i++) {
1209         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1210         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1211         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1212         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1213         int A1, A2;
1214         const void *r =  c->table_rV[V],
1215                    *g = (c->table_gU[U] + c->table_gV[V]),
1216                    *b =  c->table_bU[U];
1217
1218         if (hasAlpha) {
1219             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1220             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1221         }
1222
1223         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1224                       r, g, b, y, target, hasAlpha);
1225     }
1226 }
1227
1228 static av_always_inline void
1229 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1230                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1231                      const int16_t *abuf0, uint8_t *dest, int dstW,
1232                      int uvalpha, int y, enum PixelFormat target,
1233                      int hasAlpha)
1234 {
1235     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1236                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1237     int i;
1238
1239     if (uvalpha < 2048) {
1240         for (i = 0; i < (dstW >> 1); i++) {
1241             int Y1 = buf0[i * 2]     >> 7;
1242             int Y2 = buf0[i * 2 + 1] >> 7;
1243             int U  = ubuf1[i]        >> 7;
1244             int V  = vbuf1[i]        >> 7;
1245             int A1, A2;
1246             const void *r =  c->table_rV[V],
1247                        *g = (c->table_gU[U] + c->table_gV[V]),
1248                        *b =  c->table_bU[U];
1249
1250             if (hasAlpha) {
1251                 A1 = abuf0[i * 2    ] >> 7;
1252                 A2 = abuf0[i * 2 + 1] >> 7;
1253             }
1254
1255             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1256                           r, g, b, y, target, hasAlpha);
1257         }
1258     } else {
1259         for (i = 0; i < (dstW >> 1); i++) {
1260             int Y1 =  buf0[i * 2]          >> 7;
1261             int Y2 =  buf0[i * 2 + 1]      >> 7;
1262             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1263             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1264             int A1, A2;
1265             const void *r =  c->table_rV[V],
1266                        *g = (c->table_gU[U] + c->table_gV[V]),
1267                        *b =  c->table_bU[U];
1268
1269             if (hasAlpha) {
1270                 A1 = abuf0[i * 2    ] >> 7;
1271                 A2 = abuf0[i * 2 + 1] >> 7;
1272             }
1273
1274             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1275                           r, g, b, y, target, hasAlpha);
1276         }
1277     }
1278 }
1279
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
 *
 * Defines the static function name##ext##_X_c(), which forwards all of its
 * arguments to name##base##_X_c_template() and appends fmt and hasAlpha.
 * hasAlpha is pasted in as an expression and may refer to the context
 * parameter `c`.
 */
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void \
name ## ext ## _X_c(SwsContext *c, \
                    const int16_t *lumFilter, const int16_t **lumSrc, \
                    int lumFilterSize, \
                    const int16_t *chrFilter, const int16_t **chrUSrc, \
                    const int16_t **chrVSrc, int chrFilterSize, \
                    const int16_t **alpSrc, \
                    uint8_t *dest, int dstW, int y) \
{ \
    name ## base ## _X_c_template(c, \
                                  lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, \
                                  fmt, hasAlpha); \
}
/*
 * YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
 *
 * Defines three static functions: name##ext##_X_c() (through
 * YUV2RGBWRAPPERX), name##ext##_2_c() and name##ext##_1_c(). Each one
 * forwards its arguments to the matching name##base##_{X,2,1}_c_template()
 * and appends fmt and hasAlpha. hasAlpha is pasted in as an expression and
 * may refer to the context parameter `c`.
 */
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void \
name ## ext ## _2_c(SwsContext *c, \
                    const int16_t *buf[2], \
                    const int16_t *ubuf[2], const int16_t *vbuf[2], \
                    const int16_t *abuf[2], \
                    uint8_t *dest, int dstW, \
                    int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, \
                                  fmt, hasAlpha); \
} \
 \
static void \
name ## ext ## _1_c(SwsContext *c, \
                    const int16_t *buf0, \
                    const int16_t *ubuf[2], const int16_t *vbuf[2], \
                    const int16_t *abuf0, \
                    uint8_t *dest, int dstW, \
                    int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, \
                                  dest, dstW, uvalpha, y, \
                                  fmt, hasAlpha); \
}
1311
1312 #if CONFIG_SMALL
1313 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1314 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1315 #else
1316 #if CONFIG_SWSCALE_ALPHA
1317 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1318 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1319 #endif
1320 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1321 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1322 #endif
1323 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1324 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1325 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1326 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1327 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1328 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1329 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1330 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1331
1332 static av_always_inline void
1333 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1334                           const int16_t **lumSrc, int lumFilterSize,
1335                           const int16_t *chrFilter, const int16_t **chrUSrc,
1336                           const int16_t **chrVSrc, int chrFilterSize,
1337                           const int16_t **alpSrc, uint8_t *dest,
1338                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1339 {
1340     int i;
1341     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1342
1343     for (i = 0; i < dstW; i++) {
1344         int j;
1345         int Y = 1<<9;
1346         int U = (1<<9)-(128 << 19);
1347         int V = (1<<9)-(128 << 19);
1348         int av_unused A;
1349         int R, G, B;
1350
1351         for (j = 0; j < lumFilterSize; j++) {
1352             Y += lumSrc[j][i] * lumFilter[j];
1353         }
1354         for (j = 0; j < chrFilterSize; j++) {
1355             U += chrUSrc[j][i] * chrFilter[j];
1356             V += chrVSrc[j][i] * chrFilter[j];
1357         }
1358         Y >>= 10;
1359         U >>= 10;
1360         V >>= 10;
1361         if (hasAlpha) {
1362             A = 1 << 18;
1363             for (j = 0; j < lumFilterSize; j++) {
1364                 A += alpSrc[j][i] * lumFilter[j];
1365             }
1366             A >>= 19;
1367             if (A & 0x100)
1368                 A = av_clip_uint8(A);
1369         }
1370         Y -= c->yuv2rgb_y_offset;
1371         Y *= c->yuv2rgb_y_coeff;
1372         Y += 1 << 21;
1373         R = Y + V*c->yuv2rgb_v2r_coeff;
1374         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1375         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1376         if ((R | G | B) & 0xC0000000) {
1377             R = av_clip_uintp2(R, 30);
1378             G = av_clip_uintp2(G, 30);
1379             B = av_clip_uintp2(B, 30);
1380         }
1381
1382         switch(target) {
1383         case PIX_FMT_ARGB:
1384             dest[0] = hasAlpha ? A : 255;
1385             dest[1] = R >> 22;
1386             dest[2] = G >> 22;
1387             dest[3] = B >> 22;
1388             break;
1389         case PIX_FMT_RGB24:
1390             dest[0] = R >> 22;
1391             dest[1] = G >> 22;
1392             dest[2] = B >> 22;
1393             break;
1394         case PIX_FMT_RGBA:
1395             dest[0] = R >> 22;
1396             dest[1] = G >> 22;
1397             dest[2] = B >> 22;
1398             dest[3] = hasAlpha ? A : 255;
1399             break;
1400         case PIX_FMT_ABGR:
1401             dest[0] = hasAlpha ? A : 255;
1402             dest[1] = B >> 22;
1403             dest[2] = G >> 22;
1404             dest[3] = R >> 22;
1405             break;
1406         case PIX_FMT_BGR24:
1407             dest[0] = B >> 22;
1408             dest[1] = G >> 22;
1409             dest[2] = R >> 22;
1410             break;
1411         case PIX_FMT_BGRA:
1412             dest[0] = B >> 22;
1413             dest[1] = G >> 22;
1414             dest[2] = R >> 22;
1415             dest[3] = hasAlpha ? A : 255;
1416             break;
1417         }
1418         dest += step;
1419     }
1420 }
1421
1422 #if CONFIG_SMALL
1423 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1424 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1425 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1426 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1427 #else
1428 #if CONFIG_SWSCALE_ALPHA
1429 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1430 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1431 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1432 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1433 #endif
1434 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1435 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1436 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1437 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1438 #endif
1439 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1440 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1441
1442 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1443                                        int width, int height,
1444                                        int y, uint8_t val)
1445 {
1446     int i;
1447     uint8_t *ptr = plane + stride*y;
1448     for (i=0; i<height; i++) {
1449         memset(ptr, val, width);
1450         ptr += stride;
1451     }
1452 }
1453
/*
 * Helpers for the 48-bit RGB/BGR readers below; all of them expect
 * `origin` (the source pixel format) to be in scope.
 *
 * input_pixel(pos) loads one 16-bit component, big-endian if
 * isBE(origin), little-endian otherwise.
 *
 * r and b map the positional locals r_b (first component) and b_r (third
 * component) to red and blue: for the BGR48 formats the first component is
 * blue, for everything else it is red.
 */
#define input_pixel(pos) (!isBE(origin) ? AV_RL16(pos) : AV_RB16(pos))

#define r ((origin != PIX_FMT_BGR48BE && origin != PIX_FMT_BGR48LE) ? r_b : b_r)
#define b ((origin != PIX_FMT_BGR48BE && origin != PIX_FMT_BGR48LE) ? b_r : r_b)
1458
1459 static av_always_inline void
1460 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1461                     enum PixelFormat origin)
1462 {
1463     int i;
1464     for (i = 0; i < width; i++) {
1465         unsigned int r_b = input_pixel(&src[i*3+0]);
1466         unsigned int   g = input_pixel(&src[i*3+1]);
1467         unsigned int b_r = input_pixel(&src[i*3+2]);
1468
1469         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1470     }
1471 }
1472
1473 static av_always_inline void
1474 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1475                     const uint16_t *src1, const uint16_t *src2,
1476                     int width, enum PixelFormat origin)
1477 {
1478     int i;
1479     assert(src1==src2);
1480     for (i = 0; i < width; i++) {
1481         int r_b = input_pixel(&src1[i*3+0]);
1482         int   g = input_pixel(&src1[i*3+1]);
1483         int b_r = input_pixel(&src1[i*3+2]);
1484
1485         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1486         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1487     }
1488 }
1489
1490 static av_always_inline void
1491 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1492                           const uint16_t *src1, const uint16_t *src2,
1493                           int width, enum PixelFormat origin)
1494 {
1495     int i;
1496     assert(src1==src2);
1497     for (i = 0; i < width; i++) {
1498         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1499         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1500         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1501
1502         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1503         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1504     }
1505 }
1506
1507 #undef r
1508 #undef b
1509 #undef input_pixel
1510
/*
 * Generate the three byte-pointer entry points (ToY_c, ToUV_c, ToUV_half_c)
 * for one packed 48-bit layout. Each generated function only reinterprets
 * its uint8_t pointers as 16-bit sample pointers and forwards to the
 * matching rgb48 template together with the given origin format.
 */
#define rgb48funcs(pattern, BE_LE, origin) \
static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
                                    int width, uint32_t *unused) \
{ \
    rgb48ToY_c_template((uint16_t *) _dst, (const uint16_t *) _src, \
                        width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToUV_c_template((uint16_t *) _dstU, (uint16_t *) _dstV, \
                         (const uint16_t *) _src1, (const uint16_t *) _src2, \
                         width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
                                    const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToUV_half_c_template((uint16_t *) _dstU, (uint16_t *) _dstV, \
                              (const uint16_t *) _src1, (const uint16_t *) _src2, \
                              width, origin); \
}
1539
/* Instantiate the 48-bit readers (ToY_c / ToUV_c / ToUV_half_c) for the
 * rgb and bgr name prefixes, each in an LE and a BE variant. */
rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1544
/* Fetch packed pixel number i from the local `src`: the four listed 32-bit
 * formats are read with AV_RN32A at a 4-byte stride, every other origin with
 * AV_RB16 or AV_RL16 (chosen by isBE(origin)) at a 2-byte stride. Relies on
 * locals named src and origin being in scope. */
#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1548
1549 static av_always_inline void
1550 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1551                        int width, enum PixelFormat origin,
1552                        int shr,   int shg,   int shb, int shp,
1553                        int maskr, int maskg, int maskb,
1554                        int rsh,   int gsh,   int bsh, int S)
1555 {
1556     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1557               rnd = (32<<((S)-1)) + (1<<(S-7));
1558     int i;
1559
1560     for (i = 0; i < width; i++) {
1561         int px = input_pixel(i) >> shp;
1562         int b = (px & maskb) >> shb;
1563         int g = (px & maskg) >> shg;
1564         int r = (px & maskr) >> shr;
1565
1566         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1567     }
1568 }
1569
1570 static av_always_inline void
1571 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1572                         const uint8_t *src, int width,
1573                         enum PixelFormat origin,
1574                         int shr,   int shg,   int shb, int shp,
1575                         int maskr, int maskg, int maskb,
1576                         int rsh,   int gsh,   int bsh, int S)
1577 {
1578     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1579               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1580               rnd = (256<<((S)-1)) + (1<<(S-7));
1581     int i;
1582
1583     for (i = 0; i < width; i++) {
1584         int px = input_pixel(i) >> shp;
1585         int b = (px & maskb) >> shb;
1586         int g = (px & maskg) >> shg;
1587         int r = (px & maskr) >> shr;
1588
1589         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1590         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1591     }
1592 }
1593
/**
 * Convert packed 16- or 32-bit RGB pixels to chroma at half horizontal
 * resolution: every output sample is computed from two adjacent input
 * pixels. The two pixels are summed while still packed, the channel sums
 * are then extracted, and the final shift is one larger than in the
 * full-resolution template (S-6+1) to account for the doubled sum.
 *
 * Parameters have the same meaning as in rgb16_32ToUV_c_template; width is
 * the number of output samples.
 */
static av_always_inline void
rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
                             const uint8_t *src, int width,
                             enum PixelFormat origin,
                             int shr,   int shg,   int shb, int shp,
                             int maskr, int maskg, int maskb,
                             int rsh,   int gsh,   int bsh, int S)
{
    /* maskgx (everything that is neither red nor blue) is derived from the
     * original masks, before they are widened below. rnd is built with an
     * unsigned shift and later added as unsigned. */
    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
              rnd = (256U<<(S)) + (1<<(S-6)), maskgx = ~(maskr | maskb);
    int i;

    /* Widen every channel mask by one bit so it can hold a two-pixel sum. */
    maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
    for (i = 0; i < width; i++) {
        int px0 = input_pixel(2 * i + 0) >> shp;
        int px1 = input_pixel(2 * i + 1) >> shp;
        /* g holds the summed non-red/blue bits; rb is what is left of the
         * total, i.e. the summed red and blue fields. */
        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
        int rb = px0 + px1 - g;

        b = (rb & maskb) >> shb;
        /* With a pixel pre-shift or a 565 origin the green sum is used
         * without masking; otherwise it is masked with the widened maskg. */
        if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
            origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
            g >>= shg;
        } else {
            g = (g  & maskg) >> shg;
        }
        r = (rb & maskr) >> shr;

        dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
        dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
    }
}
1627
1628 #undef input_pixel
1629
/*
 * Generate name##ToY_c, name##ToUV_c and name##ToUV_half_c for one packed
 * 16/32-bit RGB format. Each generated function casts its uint8_t output
 * pointers to int16_t and forwards to the matching rgb16_32 template with
 * the format, shifts, masks and scale given to the macro.
 */
#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
                         maskg, maskb, rsh, gsh, bsh, S) \
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
                          int width, uint32_t *unused) \
{ \
    rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, \
                           shr, shg, shb, shp, \
                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                           const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
                           int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,  \
                            shr, shg, shb, shp, \
                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
                                const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
                                int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
                                 shr, shg, shb, shp, \
                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
}
1657
/* Instantiate the packed 16/32-bit readers. Argument order:
 * (format, name, shr, shg, shb, shp, maskr, maskg, maskb, rsh, gsh, bsh, S).
 * The LE and BE 15/16-bit variants share the same shifts and masks and
 * differ only in the format constant and the generated name. */
rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1670
1671 static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
1672                          const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
1673                          int width, enum PixelFormat origin)
1674 {
1675     int i;
1676     for (i = 0; i < width; i++) {
1677         unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
1678         unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
1679         unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
1680
1681         dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1682         dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1683     }
1684 }
1685
/**
 * Extract byte 0 of every 4-byte pixel (the alpha byte, going by the
 * function name) and scale it by 64.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    packed 4-bytes-per-pixel input
 * @param width  number of pixels
 * The remaining parameters are ignored.
 */
static void abgrToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int sample = src[4*x];
        dst[x] = sample << 6;
        x++;
    }
}
1693
/**
 * Extract byte 3 of every 4-byte pixel (the alpha byte, going by the
 * function name) and scale it by 64.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    packed 4-bytes-per-pixel input
 * @param width  number of pixels
 * The remaining parameters are ignored.
 */
static void rgbaToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int sample = src[4*x+3];
        dst[x] = sample << 6;
        x++;
    }
}
1701
/**
 * Look every source byte up in the palette and output the top byte
 * (bits 24..31) of the palette entry, scaled by 64.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    palette indices, one byte per pixel
 * @param width  number of pixels
 * @param pal    palette of 32-bit entries indexed by the source byte
 */
static void palToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const int index = src[x];
        dst[x] = (pal[index] >> 24)<<6;
    }
}
1711
/**
 * Look every source byte up in the palette and output the low byte
 * (bits 0..7) of the palette entry, scaled by 64.
 *
 * width is an int, as in every other reader in this file and in the
 * function-pointer type the readers are called through in hyscale();
 * it was previously declared long, which does not match that type.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    palette indices, one byte per pixel
 * @param width  number of pixels
 * @param pal    palette of 32-bit entries indexed by the source byte
 */
static void palToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
    int i;
    for (i=0; i<width; i++) {
        int d= src[i];

        dst[i]= (pal[d] & 0xFF)<<6;
    }
}
1721
/**
 * Look every source byte up in the palette and output bits 8..15 of the
 * entry as U and bits 16..23 as V, each scaled by 64.
 *
 * @param dstU,dstV  output chroma lines
 * @param src1       palette indices, one byte per pixel
 * @param src2       must equal src1 (asserted)
 * @param width      number of pixels
 * @param pal        palette of 32-bit entries indexed by the source byte
 */
static void palToUV_c(uint16_t *dstU, int16_t *dstV,
                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        const int entry = pal[src1[x]];

        dstU[x] = (uint8_t)(entry >> 8) << 6;
        dstV[x] = (uint8_t)(entry >> 16) << 6;
        x++;
    }
}
1735
/**
 * Expand 1-bit-per-pixel input, most significant bit first, to 16-bit
 * samples. The source byte is inverted first, so a cleared bit yields
 * 16383 and a set bit yields 0.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    packed bits, 8 pixels per byte
 * @param width  number of pixels; a trailing partial byte is handled
 */
static void monowhite2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const int tail = width&7;
    int byte, bit;

    /* whole bytes */
    for (byte = 0; byte < width/8; byte++) {
        const int inv = ~src[byte];
        int16_t *out = dst + 8*byte;

        for (bit = 0; bit < 8; bit++)
            out[bit] = (inv & (0x80 >> bit)) ? 16383 : 0;
    }
    /* remaining pixels of a partially used last byte */
    if (tail) {
        const int inv = ~src[byte];
        int16_t *out = dst + 8*byte;

        for (bit = 0; bit < tail; bit++)
            out[bit] = (inv & (0x80 >> bit)) ? 16383 : 0;
    }
}
1750
/**
 * Expand 1-bit-per-pixel input, most significant bit first, to 16-bit
 * samples: a set bit yields 16383 and a cleared bit yields 0.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    packed bits, 8 pixels per byte
 * @param width  number of pixels; a trailing partial byte is handled
 */
static void monoblack2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const int tail = width&7;
    int byte, bit;

    /* whole bytes */
    for (byte = 0; byte < width/8; byte++) {
        const int bits = src[byte];
        int16_t *out = dst + 8*byte;

        for (bit = 0; bit < 8; bit++)
            out[bit] = (bits & (0x80 >> bit)) ? 16383 : 0;
    }
    /* remaining pixels of a partially used last byte */
    if (tail) {
        const int bits = src[byte];
        int16_t *out = dst + 8*byte;

        for (bit = 0; bit < tail; bit++)
            out[bit] = (bits & (0x80 >> bit)) ? 16383 : 0;
    }
}
1765
//FIXME yuy2* can read up to 7 samples too many
1767
/**
 * Copy every second byte, starting at byte 0, from a packed line.
 *
 * @param dst    output, one byte per pixel
 * @param src    packed input, two bytes per pixel
 * @param width  number of pixels
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2*x];
        x++;
    }
}
1775
/**
 * Split the chroma bytes out of a packed line with a 4-byte group per
 * output sample: byte 1 of each group goes to dstU, byte 3 to dstV.
 *
 * The src1 == src2 precondition is checked before any output is written,
 * as rgb24ToUV_c and palToUV_c already do, so a violated precondition no
 * longer leaves a fully written output behind.
 *
 * @param dstU,dstV  output chroma lines
 * @param src1       packed input line
 * @param src2       must equal src1 (asserted)
 * @param width      number of output samples
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
}
1786
/**
 * Copy a line of 16-bit samples, passing each one through av_bswap16().
 *
 * @param _dst   output line, treated as uint16_t samples
 * @param _src   input line, treated as uint16_t samples
 * @param width  number of samples
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const uint16_t *in = (const uint16_t *) _src;
    uint16_t *out = (uint16_t *) _dst;
    int n = 0;

    while (n < width) {
        out[n] = av_bswap16(in[n]);
        n++;
    }
}
1796
/**
 * Copy two lines of 16-bit samples (src1 -> dstU, src2 -> dstV), passing
 * each sample through av_bswap16().
 *
 * @param _dstU,_dstV  output lines, treated as uint16_t samples
 * @param _src1,_src2  input lines, treated as uint16_t samples
 * @param width        number of samples per line
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *inU = (const uint16_t *) _src1;
    const uint16_t *inV = (const uint16_t *) _src2;
    uint16_t *outU = (uint16_t *) _dstU;
    uint16_t *outV = (uint16_t *) _dstV;
    int n = 0;

    while (n < width) {
        outU[n] = av_bswap16(inU[n]);
        outV[n] = av_bswap16(inV[n]);
        n++;
    }
}
1809
/* This is almost identical to yuy2ToY_c/yuy2ToUV_c and exists only because
 * calling yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
 *
 * Copies every second byte, starting at byte 1, from a packed line. */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2*x+1];
        x++;
    }
}
1819
/**
 * Split the chroma bytes out of a packed line with a 4-byte group per
 * output sample: byte 0 of each group goes to dstU, byte 2 to dstV.
 *
 * The src1 == src2 precondition is checked before any output is written,
 * as rgb24ToUV_c and palToUV_c already do.
 *
 * @param dstU,dstV  output chroma lines
 * @param src1       packed input line
 * @param src2       must equal src1 (asserted)
 * @param width      number of output samples
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
}
1830
1831 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1832                                         const uint8_t *src, int width)
1833 {
1834     int i;
1835     for (i = 0; i < width; i++) {
1836         dst1[i] = src[2*i+0];
1837         dst2[i] = src[2*i+1];
1838     }
1839 }
1840
/**
 * De-interleave src1 with nvXXtoUV_c: first byte of each pair to dstU,
 * second byte to dstV. unused0, src2 and unused are ignored.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1847
/**
 * De-interleave src1 with nvXXtoUV_c, with the outputs swapped relative to
 * nv12ToUV_c: first byte of each pair to dstV, second byte to dstU.
 * unused0, src2 and unused are ignored.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1854
/* Read the 16-bit value at pos with AV_RB16 when isBE(origin) is true and
 * with AV_RL16 otherwise; relies on a local named origin being in scope. */
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1856
1857 static void bgr24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
1858                        int width, uint32_t *unused)
1859 {
1860     int i;
1861     for (i=0; i<width; i++) {
1862         int b= src[i*3+0];
1863         int g= src[i*3+1];
1864         int r= src[i*3+2];
1865
1866         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1867     }
1868 }
1869
1870 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1871                         const uint8_t *src2, int width, uint32_t *unused)
1872 {
1873     int i;
1874     for (i=0; i<width; i++) {
1875         int b= src1[3*i + 0];
1876         int g= src1[3*i + 1];
1877         int r= src1[3*i + 2];
1878
1879         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1880         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1881     }
1882     assert(src1 == src2);
1883 }
1884
1885 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1886                              const uint8_t *src2, int width, uint32_t *unused)
1887 {
1888     int i;
1889     for (i=0; i<width; i++) {
1890         int b= src1[6*i + 0] + src1[6*i + 3];
1891         int g= src1[6*i + 1] + src1[6*i + 4];
1892         int r= src1[6*i + 2] + src1[6*i + 5];
1893
1894         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1895         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1896     }
1897     assert(src1 == src2);
1898 }
1899
1900 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1901                        uint32_t *unused)
1902 {
1903     int i;
1904     for (i=0; i<width; i++) {
1905         int r= src[i*3+0];
1906         int g= src[i*3+1];
1907         int b= src[i*3+2];
1908
1909         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1910     }
1911 }
1912
1913 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1914                         const uint8_t *src2, int width, uint32_t *unused)
1915 {
1916     int i;
1917     assert(src1==src2);
1918     for (i=0; i<width; i++) {
1919         int r= src1[3*i + 0];
1920         int g= src1[3*i + 1];
1921         int b= src1[3*i + 2];
1922
1923         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1924         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1925     }
1926 }
1927
1928 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1929                                     const uint8_t *src2, int width, uint32_t *unused)
1930 {
1931     int i;
1932     assert(src1==src2);
1933     for (i=0; i<width; i++) {
1934         int r= src1[6*i + 0] + src1[6*i + 3];
1935         int g= src1[6*i + 1] + src1[6*i + 4];
1936         int b= src1[6*i + 2] + src1[6*i + 5];
1937
1938         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1939         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1940     }
1941 }
1942
1943 static void planar_rgb_to_y(uint16_t *dst, const uint8_t *src[4], int width)
1944 {
1945     int i;
1946     for (i = 0; i < width; i++) {
1947         int g = src[0][i];
1948         int b = src[1][i];
1949         int r = src[2][i];
1950
1951         dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1952     }
1953 }
1954
1955 static void planar_rgb16le_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1956 {
1957     int i;
1958     const uint16_t **src = (const uint16_t **) _src;
1959     uint16_t *dst = (uint16_t *) _dst;
1960     for (i = 0; i < width; i++) {
1961         int g = AV_RL16(src[0] + i);
1962         int b = AV_RL16(src[1] + i);
1963         int r = AV_RL16(src[2] + i);
1964
1965         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1966     }
1967 }
1968
1969 static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1970 {
1971     int i;
1972     const uint16_t **src = (const uint16_t **) _src;
1973     uint16_t *dst = (uint16_t *) _dst;
1974     for (i = 0; i < width; i++) {
1975         int g = AV_RB16(src[0] + i);
1976         int b = AV_RB16(src[1] + i);
1977         int r = AV_RB16(src[2] + i);
1978
1979         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1980     }
1981 }
1982
1983 static void planar_rgb_to_uv(uint16_t *dstU, uint16_t *dstV, const uint8_t *src[4], int width)
1984 {
1985     int i;
1986     for (i = 0; i < width; i++) {
1987         int g = src[0][i];
1988         int b = src[1][i];
1989         int r = src[2][i];
1990
1991         dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1992         dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1993     }
1994 }
1995
1996 static void planar_rgb16le_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1997 {
1998     int i;
1999     const uint16_t **src = (const uint16_t **) _src;
2000     uint16_t *dstU = (uint16_t *) _dstU;
2001     uint16_t *dstV = (uint16_t *) _dstV;
2002     for (i = 0; i < width; i++) {
2003         int g = AV_RL16(src[0] + i);
2004         int b = AV_RL16(src[1] + i);
2005         int r = AV_RL16(src[2] + i);
2006
2007         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2008         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2009     }
2010 }
2011
2012 static void planar_rgb16be_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
2013 {
2014     int i;
2015     const uint16_t **src = (const uint16_t **) _src;
2016     uint16_t *dstU = (uint16_t *) _dstU;
2017     uint16_t *dstV = (uint16_t *) _dstV;
2018     for (i = 0; i < width; i++) {
2019         int g = AV_RB16(src[0] + i);
2020         int b = AV_RB16(src[1] + i);
2021         int r = AV_RB16(src[2] + i);
2022
2023         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2024         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2025     }
2026 }
2027
2028 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
2029                            const int16_t *filter,
2030                            const int16_t *filterPos, int filterSize)
2031 {
2032     int i;
2033     int32_t *dst = (int32_t *) _dst;
2034     const uint16_t *src = (const uint16_t *) _src;
2035     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2036     int sh = bits - 4;
2037
2038     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2039         sh= 9;
2040
2041     for (i = 0; i < dstW; i++) {
2042         int j;
2043         int srcPos = filterPos[i];
2044         int val = 0;
2045
2046         for (j = 0; j < filterSize; j++) {
2047             val += src[srcPos + j] * filter[filterSize * i + j];
2048         }
2049         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
2050         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
2051     }
2052 }
2053
2054 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
2055                            const int16_t *filter,
2056                            const int16_t *filterPos, int filterSize)
2057 {
2058     int i;
2059     const uint16_t *src = (const uint16_t *) _src;
2060     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2061
2062     if(sh<15)
2063         sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2064
2065     for (i = 0; i < dstW; i++) {
2066         int j;
2067         int srcPos = filterPos[i];
2068         int val = 0;
2069
2070         for (j = 0; j < filterSize; j++) {
2071             val += src[srcPos + j] * filter[filterSize * i + j];
2072         }
2073         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2074         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2075     }
2076 }
2077
2078 // bilinear / bicubic scaling
2079 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2080                           const int16_t *filter, const int16_t *filterPos,
2081                           int filterSize)
2082 {
2083     int i;
2084     for (i=0; i<dstW; i++) {
2085         int j;
2086         int srcPos= filterPos[i];
2087         int val=0;
2088         for (j=0; j<filterSize; j++) {
2089             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2090         }
2091         //filter += hFilterSize;
2092         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2093         //dst[i] = val>>7;
2094     }
2095 }
2096
2097 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2098                           const int16_t *filter, const int16_t *filterPos,
2099                           int filterSize)
2100 {
2101     int i;
2102     int32_t *dst = (int32_t *) _dst;
2103     for (i=0; i<dstW; i++) {
2104         int j;
2105         int srcPos= filterPos[i];
2106         int val=0;
2107         for (j=0; j<filterSize; j++) {
2108             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2109         }
2110         //filter += hFilterSize;
2111         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2112         //dst[i] = val>>7;
2113     }
2114 }
2115
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * Rescale two chroma lines in place: each sample is clipped to at most
 * 30775, then mapped to (x*4663 - 9289992) >> 12.
 *
 * @param dstU,dstV  chroma lines, modified in place
 * @param width      number of samples per line
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        const int u = FFMIN(dstU[x],30775);
        const int v = FFMIN(dstV[x],30775);

        dstU[x] = (u*4663 - 9289992)>>12; //-264
        dstV[x] = (v*4663 - 9289992)>>12; //-264
        x++;
    }
}
/**
 * Rescale two chroma lines in place: each sample x becomes
 * (x*1799 + 4081085) >> 11.
 *
 * @param dstU,dstV  chroma lines, modified in place
 * @param width      number of samples per line
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        const int u = dstU[x];
        const int v = dstV[x];

        dstU[x] = (u*1799 + 4081085)>>11; //1469
        dstV[x] = (v*1799 + 4081085)>>11; //1469
        x++;
    }
}
/**
 * Rescale a luma line in place: each sample is clipped to at most 30189,
 * then mapped to (x*19077 - 39057361) >> 14.
 *
 * @param dst    luma line, modified in place
 * @param width  number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int y = FFMIN(dst[x],30189);

        dst[x] = (y*19077 - 39057361)>>14;
        x++;
    }
}
/**
 * Rescale a luma line in place: each sample x becomes
 * (x*14071 + 33561947) >> 14.
 *
 * @param dst    luma line, modified in place
 * @param width  number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int y = dst[x];

        dst[x] = (y*14071 + 33561947)>>14;
        x++;
    }
}
2146
/**
 * Rescale two chroma lines of 32-bit intermediates in place: each sample is
 * clipped to at most 30775<<4, then mapped to
 * (x*4663 - (9289992<<4)) >> 12.
 *
 * The product is formed in unsigned arithmetic: at the clip limit
 * (30775<<4)*4663 is 2296061200, which exceeds INT_MAX, so the previous
 * signed multiply overflowed (undefined behaviour). After the subtraction
 * the value fits in int again, so the result is the same as the intended
 * one.
 *
 * @param _dstU,_dstV  chroma lines, accessed as int32_t samples
 * @param width        number of samples per line
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        /* clip to the upper limit before scaling */
        int32_t u = dstU[i] < (30775<<4) ? dstU[i] : (30775<<4);
        int32_t v = dstV[i] < (30775<<4) ? dstV[i] : (30775<<4);

        dstU[i] = (int)((unsigned)u*4663U - (9289992U<<4))>>12; //-264
        dstV[i] = (int)((unsigned)v*4663U - (9289992U<<4))>>12; //-264
    }
}
/**
 * Rescale two chroma lines of 32-bit intermediates in place: each sample x
 * becomes (x*1799 + (4081085<<4)) >> 11.
 *
 * @param _dstU,_dstV  chroma lines, accessed as int32_t samples
 * @param width        number of samples per line
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u = (int32_t *) _dstU;
    int32_t *v = (int32_t *) _dstV;
    int x = 0;

    while (x < width) {
        u[x] = (u[x]*1799 + (4081085<<4))>>11; //1469
        v[x] = (v[x]*1799 + (4081085<<4))>>11; //1469
        x++;
    }
}
/**
 * Rescale a luma line of 32-bit intermediates in place: each sample is
 * clipped to at most 30189<<4, then mapped to
 * (x*4769 - (39057361<<2)) >> 12.
 *
 * The product is formed in unsigned arithmetic: at the clip limit
 * (30189<<4)*4769 is 2303541456, which exceeds INT_MAX, so the previous
 * signed multiply overflowed (undefined behaviour). After the subtraction
 * the value fits in int again, so the result is the same as the intended
 * one.
 *
 * @param _dst   luma line, accessed as int32_t samples
 * @param width  number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++) {
        /* clip to the upper limit before scaling */
        int32_t y = dst[i] < (30189<<4) ? dst[i] : (30189<<4);

        dst[i] = (int)((unsigned)y*4769U - (39057361U<<2))>>12;
    }
}
/**
 * Rescale a luma line of 32-bit intermediates in place: each sample x
 * becomes (x*(14071/4) + (33561947<<4)/4) >> 12.
 *
 * @param _dst   luma line, accessed as int32_t samples
 * @param width  number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int32_t *y = (int32_t *) _dst;
    int x = 0;

    while (x < width) {
        y[x] = (y[x]*(14071/4) + (33561947<<4)/4)>>12;
        x++;
    }
}
2181
2182 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2183                            const uint8_t *src, int srcW, int xInc)
2184 {
2185     int i;
2186     unsigned int xpos=0;
2187     for (i=0;i<dstWidth;i++) {
2188         register unsigned int xx=xpos>>16;
2189         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2190         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2191         xpos+=xInc;
2192     }
2193     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2194         dst[i] = src[srcW-1]*128;
2195 }
2196
2197 // *** horizontal scale Y line to temp buffer
2198 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2199                                      const uint8_t *src_in[4], int srcW, int xInc,
2200                                      const int16_t *hLumFilter,
2201                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2202                                      uint8_t *formatConvBuffer,
2203                                      uint32_t *pal, int isAlpha)
2204 {
2205     void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2206     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2207     const uint8_t *src = src_in[isAlpha ? 3 : 0];
2208
2209     if (toYV12) {
2210         toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
2211         src= formatConvBuffer;
2212     } else if (c->readLumPlanar && !isAlpha) {
2213         c->readLumPlanar(formatConvBuffer, src_in, srcW);
2214         src = formatConvBuffer;
2215     }
2216
2217     if (!c->hyscale_fast) {
2218         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2219     } else { // fast bilinear upscale / crap downscale
2220         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2221     }
2222
2223     if (convertRange)
2224         convertRange(dst, dstWidth);
2225 }
2226
2227 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2228                            int dstWidth, const uint8_t *src1,
2229                            const uint8_t *src2, int srcW, int xInc)
2230 {
2231     int i;
2232     unsigned int xpos=0;
2233     for (i=0;i<dstWidth;i++) {
2234         register unsigned int xx=xpos>>16;
2235         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2236         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2237         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2238         xpos+=xInc;
2239     }
2240     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2241         dst1[i] = src1[srcW-1]*128;
2242         dst2[i] = src2[srcW-1]*128;
2243     }
2244 }
2245
2246 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2247                                      const uint8_t *src_in[4],
2248                                      int srcW, int xInc, const int16_t *hChrFilter,
2249                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2250                                      uint8_t *formatConvBuffer, uint32_t *pal)
2251 {
2252     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
2253     if (c->chrToYV12) {
2254         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2255         c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
2256         src1= formatConvBuffer;
2257         src2= buf2;
2258     } else if (c->readChrPlanar) {
2259         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2260         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
2261         src1= formatConvBuffer;
2262         src2= buf2;
2263     }
2264
2265     if (!c->hcscale_fast) {
2266         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2267         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2268     } else { // fast bilinear upscale / crap downscale
2269         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2270     }
2271
2272     if (c->chrConvertRange)
2273         c->chrConvertRange(dst1, dst2, dstWidth);
2274 }
2275
/**
 * Select the C implementations of the vertical scaling / output functions
 * for c->dstFormat and store them through the given pointers.
 *
 * Only the pointers relevant for the destination format are written; all
 * others keep whatever value the caller stored in them beforehand:
 *  - yuv2plane1 / yuv2planeX are always written (16-bit, 9/10-bit or 8-bit
 *    variants, big- or little-endian according to isBE()).
 *  - yuv2nv12cX is written only for NV12/NV21 in the 8-bit branch.
 *  - yuv2packed1 / yuv2packed2 / yuv2packedX are written only when a case in
 *    one of the switches below matches; the SWS_FULL_CHR_H_INT switch sets
 *    yuv2packedX only.
 *
 * @param c            context; dstFormat, flags and alpPixBuf are read
 * @param yuv2plane1   [out] planar output function for a single input line
 * @param yuv2planeX   [out] planar output function for a multi-tap filter
 * @param yuv2nv12cX   [out] interleaved chroma output function (NV12/NV21)
 * @param yuv2packed1  [out] packed output function, see swScale() for its use
 * @param yuv2packed2  [out] packed output function, see swScale() for its use
 * @param yuv2packedX  [out] packed output function, see swScale() for its use
 */
2276 static av_always_inline void
2277 find_c_packed_planar_out_funcs(SwsContext *c,
2278                                yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX,
2279                                yuv2interleavedX_fn *yuv2nv12cX,
2280                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2281                                yuv2packedX_fn *yuv2packedX)
2282 {
2283     enum PixelFormat dstFormat = c->dstFormat;
2284
         /* Planar output functions, chosen by destination bit depth and endianness. */
2285     if (is16BPS(dstFormat)) {
2286         *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c  : yuv2planeX_16LE_c;
2287         *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c  : yuv2plane1_16LE_c;
2288     } else if (is9_OR_10BPS(dstFormat)) {
             /* depth_minus1 == 8 selects the 9-bit functions, anything else the 10-bit ones */
2289         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2290             *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c  : yuv2planeX_9LE_c;
2291             *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c  : yuv2plane1_9LE_c;
2292         } else {
2293             *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c  : yuv2planeX_10LE_c;
2294             *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c  : yuv2plane1_10LE_c;
2295         }
2296     } else {
2297         *yuv2plane1 = yuv2plane1_8_c;
2298         *yuv2planeX = yuv2planeX_8_c;
2299         if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
2300             *yuv2nv12cX = yuv2nv12cX_c;
2301     }
2302
         /* Packed output functions. With SWS_FULL_CHR_H_INT only the "full"
          * yuv2packedX variants exist; for the 32-bit formats the alpha-writing
          * variant is used when CONFIG_SMALL is set or an alpha buffer is
          * present (and CONFIG_SWSCALE_ALPHA is enabled), otherwise the
          * variant without alpha input. */
2303     if(c->flags & SWS_FULL_CHR_H_INT) {
2304         switch (dstFormat) {
2305             case PIX_FMT_RGBA:
2306 #if CONFIG_SMALL
2307                 *yuv2packedX = yuv2rgba32_full_X_c;
2308 #else
2309 #if CONFIG_SWSCALE_ALPHA
2310                 if (c->alpPixBuf) {
2311                     *yuv2packedX = yuv2rgba32_full_X_c;
2312                 } else
2313 #endif /* CONFIG_SWSCALE_ALPHA */
2314                 {
2315                     *yuv2packedX = yuv2rgbx32_full_X_c;
2316                 }
2317 #endif /* !CONFIG_SMALL */
2318                 break;
2319             case PIX_FMT_ARGB:
2320 #if CONFIG_SMALL
2321                 *yuv2packedX = yuv2argb32_full_X_c;
2322 #else
2323 #if CONFIG_SWSCALE_ALPHA
2324                 if (c->alpPixBuf) {
2325                     *yuv2packedX = yuv2argb32_full_X_c;
2326                 } else
2327 #endif /* CONFIG_SWSCALE_ALPHA */
2328                 {
2329                     *yuv2packedX = yuv2xrgb32_full_X_c;
2330                 }
2331 #endif /* !CONFIG_SMALL */
2332                 break;
2333             case PIX_FMT_BGRA:
2334 #if CONFIG_SMALL
2335                 *yuv2packedX = yuv2bgra32_full_X_c;
2336 #else
2337 #if CONFIG_SWSCALE_ALPHA
2338                 if (c->alpPixBuf) {
2339                     *yuv2packedX = yuv2bgra32_full_X_c;
2340                 } else
2341 #endif /* CONFIG_SWSCALE_ALPHA */
2342                 {
2343                     *yuv2packedX = yuv2bgrx32_full_X_c;
2344                 }
2345 #endif /* !CONFIG_SMALL */
2346                 break;
2347             case PIX_FMT_ABGR:
2348 #if CONFIG_SMALL
2349                 *yuv2packedX = yuv2abgr32_full_X_c;
2350 #else
2351 #if CONFIG_SWSCALE_ALPHA
2352                 if (c->alpPixBuf) {
2353                     *yuv2packedX = yuv2abgr32_full_X_c;
2354                 } else
2355 #endif /* CONFIG_SWSCALE_ALPHA */
2356                 {
2357                     *yuv2packedX = yuv2xbgr32_full_X_c;
2358                 }
2359 #endif /* !CONFIG_SMALL */
2360                 break;
2361             case PIX_FMT_RGB24:
2362             *yuv2packedX = yuv2rgb24_full_X_c;
2363             break;
2364         case PIX_FMT_BGR24:
2365             *yuv2packedX = yuv2bgr24_full_X_c;
2366             break;
2367         }
             /* No full-chroma function available: fall back to the regular packed
              * selection by jumping into the else branch below.
              * NOTE(review): the switch above has no default case, so for an
              * unmatched format this test reads the value the caller passed in;
              * a non-NULL incoming *yuv2packedX therefore suppresses the fallback. */
2368         if(!*yuv2packedX)
2369             goto YUV_PACKED;
2370     } else {
2371         YUV_PACKED:
2372         switch (dstFormat) {
2373         case PIX_FMT_GRAY16BE:
2374             *yuv2packed1 = yuv2gray16BE_1_c;
2375             *yuv2packed2 = yuv2gray16BE_2_c;
2376             *yuv2packedX = yuv2gray16BE_X_c;
2377             break;
2378         case PIX_FMT_GRAY16LE:
2379             *yuv2packed1 = yuv2gray16LE_1_c;
2380             *yuv2packed2 = yuv2gray16LE_2_c;
2381             *yuv2packedX = yuv2gray16LE_X_c;
2382             break;
2383         case PIX_FMT_MONOWHITE:
2384             *yuv2packed1 = yuv2monowhite_1_c;
2385             *yuv2packed2 = yuv2monowhite_2_c;
2386             *yuv2packedX = yuv2monowhite_X_c;
2387             break;
2388         case PIX_FMT_MONOBLACK:
2389             *yuv2packed1 = yuv2monoblack_1_c;
2390             *yuv2packed2 = yuv2monoblack_2_c;
2391             *yuv2packedX = yuv2monoblack_X_c;
2392             break;
2393         case PIX_FMT_YUYV422:
2394             *yuv2packed1 = yuv2yuyv422_1_c;
2395             *yuv2packed2 = yuv2yuyv422_2_c;
2396             *yuv2packedX = yuv2yuyv422_X_c;
2397             break;
2398         case PIX_FMT_UYVY422:
2399             *yuv2packed1 = yuv2uyvy422_1_c;
2400             *yuv2packed2 = yuv2uyvy422_2_c;
2401             *yuv2packedX = yuv2uyvy422_X_c;
2402             break;
2403         case PIX_FMT_RGB48LE:
2404             *yuv2packed1 = yuv2rgb48le_1_c;
2405             *yuv2packed2 = yuv2rgb48le_2_c;
2406             *yuv2packedX = yuv2rgb48le_X_c;
2407             break;
2408         case PIX_FMT_RGB48BE:
2409             *yuv2packed1 = yuv2rgb48be_1_c;
2410             *yuv2packed2 = yuv2rgb48be_2_c;
2411             *yuv2packedX = yuv2rgb48be_X_c;
2412             break;
2413         case PIX_FMT_BGR48LE:
2414             *yuv2packed1 = yuv2bgr48le_1_c;
2415             *yuv2packed2 = yuv2bgr48le_2_c;
2416             *yuv2packedX = yuv2bgr48le_X_c;
2417             break;
2418         case PIX_FMT_BGR48BE:
2419             *yuv2packed1 = yuv2bgr48be_1_c;
2420             *yuv2packed2 = yuv2bgr48be_2_c;
2421             *yuv2packedX = yuv2bgr48be_X_c;
2422             break;
             /* 32-bit formats: same alpha / no-alpha split as in the full-chroma switch */
2423         case PIX_FMT_RGB32:
2424         case PIX_FMT_BGR32:
2425 #if CONFIG_SMALL
2426             *yuv2packed1 = yuv2rgb32_1_c;
2427             *yuv2packed2 = yuv2rgb32_2_c;
2428             *yuv2packedX = yuv2rgb32_X_c;
2429 #else
2430 #if CONFIG_SWSCALE_ALPHA
2431                 if (c->alpPixBuf) {
2432                     *yuv2packed1 = yuv2rgba32_1_c;
2433                     *yuv2packed2 = yuv2rgba32_2_c;
2434                     *yuv2packedX = yuv2rgba32_X_c;
2435                 } else
2436 #endif /* CONFIG_SWSCALE_ALPHA */
2437                 {
2438                     *yuv2packed1 = yuv2rgbx32_1_c;
2439                     *yuv2packed2 = yuv2rgbx32_2_c;
2440                     *yuv2packedX = yuv2rgbx32_X_c;
2441                 }
2442 #endif /* !CONFIG_SMALL */
2443             break;
2444         case PIX_FMT_RGB32_1:
2445         case PIX_FMT_BGR32_1:
2446 #if CONFIG_SMALL
2447                 *yuv2packed1 = yuv2rgb32_1_1_c;
2448                 *yuv2packed2 = yuv2rgb32_1_2_c;
2449                 *yuv2packedX = yuv2rgb32_1_X_c;
2450 #else
2451 #if CONFIG_SWSCALE_ALPHA
2452                 if (c->alpPixBuf) {
2453                     *yuv2packed1 = yuv2rgba32_1_1_c;
2454                     *yuv2packed2 = yuv2rgba32_1_2_c;
2455                     *yuv2packedX = yuv2rgba32_1_X_c;
2456                 } else
2457 #endif /* CONFIG_SWSCALE_ALPHA */
2458                 {
2459                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2460                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2461                     *yuv2packedX = yuv2rgbx32_1_X_c;
2462                 }
2463 #endif /* !CONFIG_SMALL */
2464                 break;
2465         case PIX_FMT_RGB24:
2466             *yuv2packed1 = yuv2rgb24_1_c;
2467             *yuv2packed2 = yuv2rgb24_2_c;
2468             *yuv2packedX = yuv2rgb24_X_c;
2469             break;
2470         case PIX_FMT_BGR24:
2471             *yuv2packed1 = yuv2bgr24_1_c;
2472             *yuv2packed2 = yuv2bgr24_2_c;
2473             *yuv2packedX = yuv2bgr24_X_c;
2474             break;
             /* The remaining RGB/BGR variants of one bit depth share one set of functions. */
2475         case PIX_FMT_RGB565LE:
2476         case PIX_FMT_RGB565BE:
2477         case PIX_FMT_BGR565LE:
2478         case PIX_FMT_BGR565BE:
2479             *yuv2packed1 = yuv2rgb16_1_c;
2480             *yuv2packed2 = yuv2rgb16_2_c;
2481             *yuv2packedX = yuv2rgb16_X_c;
2482             break;
2483         case PIX_FMT_RGB555LE:
2484         case PIX_FMT_RGB555BE:
2485         case PIX_FMT_BGR555LE:
2486         case PIX_FMT_BGR555BE:
2487             *yuv2packed1 = yuv2rgb15_1_c;
2488             *yuv2packed2 = yuv2rgb15_2_c;
2489             *yuv2packedX = yuv2rgb15_X_c;
2490             break;
2491         case PIX_FMT_RGB444LE:
2492         case PIX_FMT_RGB444BE:
2493         case PIX_FMT_BGR444LE:
2494         case PIX_FMT_BGR444BE:
2495             *yuv2packed1 = yuv2rgb12_1_c;
2496             *yuv2packed2 = yuv2rgb12_2_c;
2497             *yuv2packedX = yuv2rgb12_X_c;
2498             break;
2499         case PIX_FMT_RGB8:
2500         case PIX_FMT_BGR8:
2501             *yuv2packed1 = yuv2rgb8_1_c;
2502             *yuv2packed2 = yuv2rgb8_2_c;
2503             *yuv2packedX = yuv2rgb8_X_c;
2504             break;
2505         case PIX_FMT_RGB4:
2506         case PIX_FMT_BGR4:
2507             *yuv2packed1 = yuv2rgb4_1_c;
2508             *yuv2packed2 = yuv2rgb4_2_c;
2509             *yuv2packedX = yuv2rgb4_X_c;
2510             break;
2511         case PIX_FMT_RGB4_BYTE:
2512         case PIX_FMT_BGR4_BYTE:
2513             *yuv2packed1 = yuv2rgb4b_1_c;
2514             *yuv2packed2 = yuv2rgb4b_2_c;
2515             *yuv2packedX = yuv2rgb4b_X_c;
2516             break;
2517         }
2518     }
2519 }
2520
/*
 * Buffer tracing for swScale(). Set DEBUG_SWSCALE_BUFFERS to a non-zero value
 * to have DEBUG_BUFFERS() log through av_log() at AV_LOG_DEBUG level.
 * DEBUG_BUFFERS() expands to an unbraced `if` statement and refers to a
 * variable named `c` that must be in scope at the call site; use it only as a
 * complete statement and never directly in front of an `else`.
 */
2521 #define DEBUG_SWSCALE_BUFFERS 0
2522 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2523
/**
 * Scale one slice of the source picture and write every destination line
 * that can be completed with the source lines seen so far.
 *
 * Source lines are first scaled horizontally into the line buffers
 * (lumPixBuf, chrUPixBuf/chrVPixBuf and, if present, alpPixBuf); destination
 * lines are then produced from those buffers by the yuv2plane* or yuv2packed*
 * output functions. The progress state (dstY, lumBufIndex, chrBufIndex,
 * lastInLumBuf, lastInChrBuf) is loaded from the context on entry and stored
 * back on exit, so a later call can continue with the following slice.
 *
 * NOTE: the caller's src[] and srcStride[] arrays are modified in place:
 * for packed source formats all four entries are set to entry 0, and
 * srcStride[1] and srcStride[2] are always shifted left by c->vChrDrop.
 *
 * @param c          scaler context
 * @param src        source plane pointers for this slice
 * @param srcStride  source strides (all four entries are read)
 * @param srcSliceY  index of the first source line in this slice; 0 resets
 *                   the progress state
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides (all four entries are read)
 * @return number of destination lines by which dstY advanced in this call
 */
2524 static int swScale(SwsContext *c, const uint8_t* src[],
2525                    int srcStride[], int srcSliceY,
2526                    int srcSliceH, uint8_t* dst[], int dstStride[])
2527 {
2528     /* load a few things into local vars to make the code more readable? and faster */
2529     const int srcW= c->srcW;
2530     const int dstW= c->dstW;
2531     const int dstH= c->dstH;
2532     const int chrDstW= c->chrDstW;
2533     const int chrSrcW= c->chrSrcW;
2534     const int lumXInc= c->lumXInc;
2535     const int chrXInc= c->chrXInc;
2536     const enum PixelFormat dstFormat= c->dstFormat;
2537     const int flags= c->flags;
2538     int16_t *vLumFilterPos= c->vLumFilterPos;
2539     int16_t *vChrFilterPos= c->vChrFilterPos;
2540     int16_t *hLumFilterPos= c->hLumFilterPos;
2541     int16_t *hChrFilterPos= c->hChrFilterPos;
2542     int16_t *hLumFilter= c->hLumFilter;
2543     int16_t *hChrFilter= c->hChrFilter;
2544     int32_t *lumMmxFilter= c->lumMmxFilter;
2545     int32_t *chrMmxFilter= c->chrMmxFilter;
2546     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2547     const int vLumFilterSize= c->vLumFilterSize;
2548     const int vChrFilterSize= c->vChrFilterSize;
2549     const int hLumFilterSize= c->hLumFilterSize;
2550     const int hChrFilterSize= c->hChrFilterSize;
2551     int16_t **lumPixBuf= c->lumPixBuf;
2552     int16_t **chrUPixBuf= c->chrUPixBuf;
2553     int16_t **chrVPixBuf= c->chrVPixBuf;
2554     int16_t **alpPixBuf= c->alpPixBuf;
2555     const int vLumBufSize= c->vLumBufSize;
2556     const int vChrBufSize= c->vChrBufSize;
2557     uint8_t *formatConvBuffer= c->formatConvBuffer;
2558     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
         /* negate-shift-negate: rounds the chroma slice height up instead of
          * down (assuming an arithmetic right shift of negative values) */
2559     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2560     int lastDstY;
2561     uint32_t *pal=c->pal_yuv;
         /* per-line dither tables are used only when isNBPS() or is16BPS()
          * reports true for the source format */
2562     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2563
         /* local copies of the output functions; they may be replaced by the C
          * versions for the last lines of the picture (see below) */
2564     yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
2565     yuv2planarX_fn yuv2planeX = c->yuv2planeX;
2566     yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
2567     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2568     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2569     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2570
2571     /* vars which will change and which we need to store back in the context */
2572     int dstY= c->dstY;
2573     int lumBufIndex= c->lumBufIndex;
2574     int chrBufIndex= c->chrBufIndex;
2575     int lastInLumBuf= c->lastInLumBuf;
2576     int lastInChrBuf= c->lastInChrBuf;
2577
         /* Packed input: make every plane pointer and stride an alias of plane 0.
          * This writes to the caller's arrays. */
2578     if (isPacked(c->srcFormat)) {
2579         src[0]=
2580         src[1]=
2581         src[2]=
2582         src[3]= src[0];
2583         srcStride[0]=
2584         srcStride[1]=
2585         srcStride[2]=
2586         srcStride[3]= srcStride[0];
2587     }
         /* scale the chroma strides by 2^vChrDrop (also written to the caller's array) */
2588     srcStride[1]<<= c->vChrDrop;
2589     srcStride[2]<<= c->vChrDrop;
2590
2591     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2592                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2593                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2594     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2595                    srcSliceY,    srcSliceH,    dstY,    dstH);
2596     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2597                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2598
         /* One-time warning (only with SWS_PRINT_INFO) about unaligned
          * destination strides; the flag is a function-level static and thus
          * shared by all contexts. */
2599     if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 || dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
2600         static int warnedAlready=0; //FIXME move this into the context perhaps
2601         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2602             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2603                    "         ->cannot do aligned memory accesses anymore\n");
2604             warnedAlready=1;
2605         }
2606     }
2607
         /* One-time warning about unaligned pointers or strides.
          * NOTE(review): the pointers are cast to int, which truncates them where
          * pointers are wider than int; only the low bits matter for the %16
          * test, but the cast is not portable -- consider uintptr_t. */
2608     if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
2609         || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
2610         || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
2611     ) {
2612         static int warnedAlready=0;
2613         int cpu_flags = av_get_cpu_flags();
2614         if (HAVE_MMX2 && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
2615             av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
2616             warnedAlready=1;
2617         }
2618     }
2619
2620     /* Note the user might start scaling the picture in the middle so this
2621        will not get executed. This is not really intended but works
2622        currently, so people might do it. */
         /* A slice starting at source line 0 restarts the picture: reset the
          * buffer indices and the output position. */
2623     if (srcSliceY ==0) {
2624         lumBufIndex=-1;
2625         chrBufIndex=-1;
2626         dstY=0;
2627         lastInLumBuf= -1;
2628         lastInChrBuf= -1;
2629     }
2630
         /* without per-line dithering both dither pointers use the fixed ff_sws_pb_64 table */
2631     if (!should_dither) {
2632         c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
2633     }
         /* remembered after the possible reset so the return value counts only
          * the lines produced by this call */
2634     lastDstY= dstY;
2635
         /* One iteration per destination line; left early via break when the
          * slice does not contain enough source lines for the current line. */
2636     for (;dstY < dstH; dstY++) {
2637         const int chrDstY= dstY>>c->chrDstVSubSample;
             /* start of the current output line in each plane; the alpha entry
              * is NULL unless alpha support is compiled in and a buffer exists */
2638         uint8_t *dest[4] = {
2639             dst[0] + dstStride[0] * dstY,
2640             dst[1] + dstStride[1] * chrDstY,
2641             dst[2] + dstStride[2] * chrDstY,
2642             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2643         };
2644         int use_mmx_vfilter= c->use_mmx_vfilter;
2645
2646         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
             /* dstY | mask is the last destination line that maps to the same
              * chrDstY as dstY (clamped to dstH-1); its luma range is what the
              * enough_lines test below uses */
2647         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2648         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2649         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2650         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2651         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2652         int enough_lines;
2653
2654         //handle holes (FAST_BILINEAR & weird filters)
2655         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2656         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
             /* the first line needed must still be within the last
              * vLumBufSize / vChrBufSize buffered lines */
2657         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2658         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2659
2660         DEBUG_BUFFERS("dstY: %d\n", dstY);
2661         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2662                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2663         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2664                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2665
2666         // Do we have enough lines in this slice to output the dstY line
2667         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2668
             /* Not enough input: still buffer everything this slice provides,
              * up to its last luma and chroma line, before leaving the loop. */
2669         if (!enough_lines) {
2670             lastLumSrcY = srcSliceY + srcSliceH - 1;
2671             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2672             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2673                                             lastLumSrcY, lastChrSrcY);
2674         }
2675
2676         //Do horizontal scaling
             /* luma (and alpha): scale source line lastInLumBuf+1 into the next
              * buffer slot until line lastLumSrcY has been buffered */
2677         while(lastInLumBuf < lastLumSrcY) {
2678             const uint8_t *src1[4] = {
2679                 src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
2680                 src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
2681                 src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
2682                 src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
2683             };
2684             lumBufIndex++;
2685             assert(lumBufIndex < 2*vLumBufSize);
2686             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2687             assert(lastInLumBuf + 1 - srcSliceY >= 0);
                 /* the last hyscale() argument is 0 for luma and 1 for alpha */
2688             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2689                     hLumFilter, hLumFilterPos, hLumFilterSize,
2690                     formatConvBuffer,
2691                     pal, 0);
2692             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2693                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src1, srcW,
2694                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2695                         formatConvBuffer,
2696                         pal, 1);
2697             lastInLumBuf++;
2698             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2699                                lumBufIndex,    lastInLumBuf);
2700         }
             /* chroma: same scheme, with line offsets relative to chrSrcSliceY;
              * the buffer index advances even when c->needs_hcscale is unset and
              * the horizontal scaler is skipped */
2701         while(lastInChrBuf < lastChrSrcY) {
2702             const uint8_t *src1[4] = {
2703                 src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
2704                 src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
2705                 src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
2706                 src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
2707             };
2708             chrBufIndex++;
2709             assert(chrBufIndex < 2*vChrBufSize);
2710             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2711             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2712             //FIXME replace parameters through context struct (some at least)
2713
2714             if (c->needs_hcscale)
2715                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2716                           chrDstW, src1, chrSrcW, chrXInc,
2717                           hChrFilter, hChrFilterPos, hChrFilterSize,
2718                           formatConvBuffer, pal);
2719             lastInChrBuf++;
2720             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2721                                chrBufIndex,    lastInChrBuf);
2722         }
2723         //wrap buf index around to stay inside the ring buffer
2724         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2725         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2726         if (!enough_lines)
2727             break; //we can't output a dstY line so let's try with the next slice
2728
2729 #if HAVE_MMX
2730         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2731 #endif
             /* per-line dither rows, indexed by the low three bits of the line number */
2732         if (should_dither) {
2733             c->chrDither8 = dither_8x8_128[chrDstY & 7];
2734             c->lumDither8 = dither_8x8_128[dstY & 7];
2735         }
             /* For the last two destination lines, re-select the C output
              * functions into the local pointers and disable the MMX vertical
              * filter for this line. */
2736         if (dstY >= dstH-2) {
2737             // hmm looks like we can't use MMX here without overwriting this array's tail
2738             find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
2739                                            &yuv2packed1, &yuv2packed2, &yuv2packedX);
2740             use_mmx_vfilter= 0;
2741         }
2742
2743         {
                 /* Pointers to the buffer entry holding the first source line
                  * needed for this output line.
                  * NOTE(review): the "+ vLumBufSize" / "+ vChrBufSize" offsets
                  * together with the 2*size asserts suggest the pointer arrays
                  * hold each line twice (2*size entries) -- the allocation is
                  * not visible here, confirm. */
2744             const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2745             const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2746             const int16_t **chrVSrcPtr= (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2747             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2748             int16_t *vLumFilter= c->vLumFilter;
2749             int16_t *vChrFilter= c->vChrFilter;
2750
2751             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2752                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2753
                     /* advance to the coefficients of this output line */
2754                 vLumFilter +=    dstY * vLumFilterSize;
2755                 vChrFilter += chrDstY * vChrFilterSize;
2756
                     /* Sanity check, effective on x86 only: use_mmx_vfilter must
                      * differ from "yuv2planeX is one of the C implementations"
                      * (assuming the flag is 0 or 1). */
2757                 av_assert0(use_mmx_vfilter != (
2758                                yuv2planeX == yuv2planeX_10BE_c
2759                             || yuv2planeX == yuv2planeX_10LE_c
2760                             || yuv2planeX == yuv2planeX_9BE_c
2761                             || yuv2planeX == yuv2planeX_9LE_c
2762                             || yuv2planeX == yuv2planeX_16BE_c
2763                             || yuv2planeX == yuv2planeX_16LE_c
2764                             || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
2765
                     /* the MMX vertical filter takes its coefficients from the
                      * context's MMX filter tables instead */
2766                 if(use_mmx_vfilter){
2767                     vLumFilter= c->lumMmxFilter;
2768                     vChrFilter= c->chrMmxFilter;
2769                 }
2770
                     /* luma: single-line copy when the filter has one tap, full filter otherwise */
2771                 if (vLumFilterSize == 1) {
2772                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
2773                 } else {
2774                     yuv2planeX(vLumFilter, vLumFilterSize,
2775                                lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
2776                 }
2777
                     /* chroma is written only on lines where the low chrDstVSubSample
                      * bits of dstY are zero, and never for gray formats */
2778                 if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
2779                     if (yuv2nv12cX) {
2780                         yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
2781                     } else if (vChrFilterSize == 1) {
2782                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
2783                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
2784                     } else {
2785                         yuv2planeX(vChrFilter, vChrFilterSize,
2786                                    chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
2787                         yuv2planeX(vChrFilter, vChrFilterSize,
2788                                    chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
2789                     }
2790                 }
2791
                     /* alpha plane: same scheme as luma, using the luma filter size */
2792                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2793                     if(use_mmx_vfilter){
2794                         vLumFilter= c->alpMmxFilter;
2795                     }
2796                     if (vLumFilterSize == 1) {
2797                         yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
2798                     } else {
2799                         yuv2planeX(vLumFilter, vLumFilterSize,
2800                                    alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
2801                     }
2802                 }
2803             } else {
                     /* Packed output: choose the 1-line, 2-line or general writer by
                      * vertical filter sizes. The availability tests read the
                      * context's pointers, the calls go through the local copies. */
2804                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2805                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2806                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2807                     int chrAlpha = vChrFilter[2 * dstY + 1];
2808                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2809                                 alpPixBuf ? *alpSrcPtr : NULL,
2810                                 dest[0], dstW, chrAlpha, dstY);
2811                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2812                     int lumAlpha = vLumFilter[2 * dstY + 1];
2813                     int chrAlpha = vChrFilter[2 * dstY + 1];
                         /* store the first coefficient of each 2-tap filter, duplicated
                          * into both 16-bit halves, in the MMX filter tables */
2814                     lumMmxFilter[2] =
2815                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2816                     chrMmxFilter[2] =
2817                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2818                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2819                                 alpPixBuf ? alpSrcPtr : NULL,
2820                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2821                 } else { //general RGB
2822                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2823                                 lumSrcPtr, vLumFilterSize,
2824                                 vChrFilter + dstY * vChrFilterSize,
2825                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2826                                 alpSrcPtr, dest[0], dstW, dstY);
2827                 }
2828             }
2829         }
2830     }
2831
         /* YUVA420P output without an alpha buffer: presumably fills the alpha
          * rows written by this call (dstY-lastDstY rows starting at lastDstY)
          * with 255 -- fillPlane() is defined elsewhere, confirm its argument order. */
2832     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2833         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2834
         /* store fence when MMX2 is available at run time, then leave MMX state */
2835 #if HAVE_MMX2
2836     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2837         __asm__ volatile("sfence":::"memory");
2838 #endif
2839     emms_c();
2840
2841     /* store changed local vars back in the context */
2842     c->dstY= dstY;
2843     c->lumBufIndex= lumBufIndex;
2844     c->chrBufIndex= chrBufIndex;
2845     c->lastInLumBuf= lastInLumBuf;
2846     c->lastInChrBuf= lastInChrBuf;
2847
2848     return dstY - lastDstY;
2849 }
2850
2851 static av_cold void sws_init_swScale_c(SwsContext *c)
2852 {
2853     enum PixelFormat srcFormat = c->srcFormat;
2854
2855     find_c_packed_planar_out_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
2856                                    &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,
2857                                    &c->yuv2packedX);
2858
2859     c->chrToYV12 = NULL;
2860     switch(srcFormat) {
2861         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2862         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2863         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2864         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2865         case PIX_FMT_RGB8     :
2866         case PIX_FMT_BGR8     :
2867         case PIX_FMT_PAL8     :
2868         case PIX_FMT_BGR4_BYTE:
2869         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2870         case PIX_FMT_GBRP9LE:
2871         case PIX_FMT_GBRP10LE:
2872         case PIX_FMT_GBRP16LE:  c->readChrPlanar = planar_rgb16le_to_uv; break;
2873         case PIX_FMT_GBRP9BE:
2874         case PIX_FMT_GBRP10BE:
2875         case PIX_FMT_GBRP16BE:  c->readChrPlanar = planar_rgb16be_to_uv; break;
2876         case PIX_FMT_GBRP:      c->readChrPlanar = planar_rgb_to_uv; break;
2877 #if HAVE_BIGENDIAN
2878         case PIX_FMT_YUV444P9LE:
2879         case PIX_FMT_YUV422P9LE:
2880         case PIX_FMT_YUV420P9LE:
2881         case PIX_FMT_YUV422P10LE:
2882         case PIX_FMT_YUV420P10LE:
2883         case PIX_FMT_YUV444P10LE:
2884         case PIX_FMT_YUV420P16LE:
2885         case PIX_FMT_YUV422P16LE:
2886         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2887 #else
2888         case PIX_FMT_YUV444P9BE:
2889         case PIX_FMT_YUV422P9BE:
2890         case PIX_FMT_YUV420P9BE:
2891         case PIX_FMT_YUV444P10BE:
2892         case PIX_FMT_YUV422P10BE:
2893         case PIX_FMT_YUV420P10BE:
2894         case PIX_FMT_YUV420P16BE:
2895         case PIX_FMT_YUV422P16BE:
2896         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2897 #endif
2898     }
2899     if (c->chrSrcHSubSample) {
2900         switch(srcFormat) {
2901         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2902         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2903         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2904         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2905         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2906         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2907         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2908         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2909         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2910         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2911         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2912         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2913         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2914         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2915         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2916         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2917         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2918         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2919         case PIX_FMT_GBR24P  : c->chrToYV12 = gbr24pToUV_half_c;  break;
2920         }
2921     } else {
2922         switch(srcFormat) {
2923         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2924         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2925         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2926         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2927         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2928         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2929         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2930         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2931         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2932         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2933         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2934         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2935         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2936         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2937         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2938         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2939         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2940         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2941         }
2942     }
2943
2944     c->lumToYV12 = NULL;
2945     c->alpToYV12 = NULL;
2946     switch (srcFormat) {
2947     case PIX_FMT_GBRP9LE:
2948     case PIX_FMT_GBRP10LE:
2949     case PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break;
2950     case PIX_FMT_GBRP9BE:
2951     case PIX_FMT_GBRP10BE:
2952     case PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break;
2953     case PIX_FMT_GBRP:     c->readLumPlanar = planar_rgb_to_y; break;
2954 #if HAVE_BIGENDIAN
2955     case PIX_FMT_YUV444P9LE:
2956     case PIX_FMT_YUV422P9LE:
2957     case PIX_FMT_YUV420P9LE:
2958     case PIX_FMT_YUV422P10LE:
2959     case PIX_FMT_YUV420P10LE:
2960     case PIX_FMT_YUV444P10LE:
2961     case PIX_FMT_YUV420P16LE:
2962     case PIX_FMT_YUV422P16LE:
2963     case PIX_FMT_YUV444P16LE:
2964     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2965 #else
2966     case PIX_FMT_YUV444P9BE:
2967     case PIX_FMT_YUV422P9BE:
2968     case PIX_FMT_YUV420P9BE:
2969     case PIX_FMT_YUV444P10BE:
2970     case PIX_FMT_YUV422P10BE:
2971     case PIX_FMT_YUV420P10BE:
2972     case PIX_FMT_YUV420P16BE:
2973     case PIX_FMT_YUV422P16BE:
2974     case PIX_FMT_YUV444P16BE:
2975     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2976 #endif
2977     case PIX_FMT_YUYV422  :
2978     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2979     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2980     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2981     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2982     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2983     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2984     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2985     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2986     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2987     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2988     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2989     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2990     case PIX_FMT_RGB8     :
2991     case PIX_FMT_BGR8     :
2992     case PIX_FMT_PAL8     :
2993     case PIX_FMT_BGR4_BYTE:
2994     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2995     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2996     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2997     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2998     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2999     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
3000     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
3001     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
3002     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
3003     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
3004     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
3005     }
3006     if (c->alpPixBuf) {
3007         switch (srcFormat) {
3008         case PIX_FMT_BGRA:
3009         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
3010         case PIX_FMT_ABGR:
3011         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
3012         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
3013         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
3014         }
3015     }
3016
3017
3018     if (c->srcBpc == 8) {
3019         if (c->dstBpc <= 10) {
3020             c->hyScale = c->hcScale = hScale8To15_c;
3021             if (c->flags & SWS_FAST_BILINEAR) {
3022                 c->hyscale_fast = hyscale_fast_c;
3023                 c->hcscale_fast = hcscale_fast_c;
3024             }
3025         } else {
3026             c->hyScale = c->hcScale = hScale8To19_c;
3027         }
3028     } else {
3029         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
3030     }
3031
3032     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3033         if (c->dstBpc <= 10) {
3034             if (c->srcRange) {
3035                 c->lumConvertRange = lumRangeFromJpeg_c;
3036                 c->chrConvertRange = chrRangeFromJpeg_c;
3037             } else {
3038                 c->lumConvertRange = lumRangeToJpeg_c;
3039                 c->chrConvertRange = chrRangeToJpeg_c;
3040             }
3041         } else {
3042             if (c->srcRange) {
3043                 c->lumConvertRange = lumRangeFromJpeg16_c;
3044                 c->chrConvertRange = chrRangeFromJpeg16_c;
3045             } else {
3046                 c->lumConvertRange = lumRangeToJpeg16_c;
3047                 c->chrConvertRange = chrRangeToJpeg16_c;
3048             }
3049         }
3050     }
3051
3052     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3053           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3054         c->needs_hcscale = 1;
3055 }
3056
3057 SwsFunc ff_getSwsFunc(SwsContext *c)
3058 {
3059     sws_init_swScale_c(c);
3060
3061     if (HAVE_MMX)
3062         ff_sws_init_swScale_mmx(c);
3063     if (HAVE_ALTIVEC)
3064         ff_sws_init_swScale_altivec(c);
3065
3066     return swScale;
3067 }