git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/avassert.h"
  64 #include "libavutil/intreadwrite.h"
  65 #include "libavutil/cpu.h"
  66 #include "libavutil/avutil.h"
  67 #include "libavutil/mathematics.h"
  68 #include "libavutil/bswap.h"
  69 #include "libavutil/pixdesc.h"
  70
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 186 {  36, 68, 60, 92, 34, 66, 58, 90,},
 187 { 100,  4,124, 28, 98,  2,122, 26,},
 188 {  52, 84, 44, 76, 50, 82, 42, 74,},
 189 { 116, 20,108, 12,114, 18,106, 10,},
 190 {  32, 64, 56, 88, 38, 70, 62, 94,},
 191 {  96,  0,120, 24,102,  6,126, 30,},
 192 {  48, 80, 40, 72, 54, 86, 46, 78,},
 193 { 112, 16,104,  8,118, 22,110, 14,},
 194 };
 195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 196 {  64, 64, 64, 64, 64, 64, 64, 64 };
 197
 198 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 199 {
 200   {   0,  1,  0,  1,  0,  1,  0,  1,},
 201   {   1,  0,  1,  0,  1,  0,  1,  0,},
 202   {   0,  1,  0,  1,  0,  1,  0,  1,},
 203   {   1,  0,  1,  0,  1,  0,  1,  0,},
 204   {   0,  1,  0,  1,  0,  1,  0,  1,},
 205   {   1,  0,  1,  0,  1,  0,  1,  0,},
 206   {   0,  1,  0,  1,  0,  1,  0,  1,},
 207   {   1,  0,  1,  0,  1,  0,  1,  0,},
 208 },{
 209   {   1,  2,  1,  2,  1,  2,  1,  2,},
 210   {   3,  0,  3,  0,  3,  0,  3,  0,},
 211   {   1,  2,  1,  2,  1,  2,  1,  2,},
 212   {   3,  0,  3,  0,  3,  0,  3,  0,},
 213   {   1,  2,  1,  2,  1,  2,  1,  2,},
 214   {   3,  0,  3,  0,  3,  0,  3,  0,},
 215   {   1,  2,  1,  2,  1,  2,  1,  2,},
 216   {   3,  0,  3,  0,  3,  0,  3,  0,},
 217 },{
 218   {   2,  4,  3,  5,  2,  4,  3,  5,},
 219   {   6,  0,  7,  1,  6,  0,  7,  1,},
 220   {   3,  5,  2,  4,  3,  5,  2,  4,},
 221   {   7,  1,  6,  0,  7,  1,  6,  0,},
 222   {   2,  4,  3,  5,  2,  4,  3,  5,},
 223   {   6,  0,  7,  1,  6,  0,  7,  1,},
 224   {   3,  5,  2,  4,  3,  5,  2,  4,},
 225   {   7,  1,  6,  0,  7,  1,  6,  0,},
 226 },{
 227   {   4,  8,  7, 11,  4,  8,  7, 11,},
 228   {  12,  0, 15,  3, 12,  0, 15,  3,},
 229   {   6, 10,  5,  9,  6, 10,  5,  9,},
 230   {  14,  2, 13,  1, 14,  2, 13,  1,},
 231   {   4,  8,  7, 11,  4,  8,  7, 11,},
 232   {  12,  0, 15,  3, 12,  0, 15,  3,},
 233   {   6, 10,  5,  9,  6, 10,  5,  9,},
 234   {  14,  2, 13,  1, 14,  2, 13,  1,},
 235 },{
 236   {   9, 17, 15, 23,  8, 16, 14, 22,},
 237   {  25,  1, 31,  7, 24,  0, 30,  6,},
 238   {  13, 21, 11, 19, 12, 20, 10, 18,},
 239   {  29,  5, 27,  3, 28,  4, 26,  2,},
 240   {   8, 16, 14, 22,  9, 17, 15, 23,},
 241   {  24,  0, 30,  6, 25,  1, 31,  7,},
 242   {  12, 20, 10, 18, 13, 21, 11, 19,},
 243   {  28,  4, 26,  2, 29,  5, 27,  3,},
 244 },{
 245   {  18, 34, 30, 46, 17, 33, 29, 45,},
 246   {  50,  2, 62, 14, 49,  1, 61, 13,},
 247   {  26, 42, 22, 38, 25, 41, 21, 37,},
 248   {  58, 10, 54,  6, 57,  9, 53,  5,},
 249   {  16, 32, 28, 44, 19, 35, 31, 47,},
 250   {  48,  0, 60, 12, 51,  3, 63, 15,},
 251   {  24, 40, 20, 36, 27, 43, 23, 39,},
 252   {  56,  8, 52,  4, 59, 11, 55,  7,},
 253 },{
 254   {  18, 34, 30, 46, 17, 33, 29, 45,},
 255   {  50,  2, 62, 14, 49,  1, 61, 13,},
 256   {  26, 42, 22, 38, 25, 41, 21, 37,},
 257   {  58, 10, 54,  6, 57,  9, 53,  5,},
 258   {  16, 32, 28, 44, 19, 35, 31, 47,},
 259   {  48,  0, 60, 12, 51,  3, 63, 15,},
 260   {  24, 40, 20, 36, 27, 43, 23, 39,},
 261   {  56,  8, 52,  4, 59, 11, 55,  7,},
 262 },{
 263   {  36, 68, 60, 92, 34, 66, 58, 90,},
 264   { 100,  4,124, 28, 98,  2,122, 26,},
 265   {  52, 84, 44, 76, 50, 82, 42, 74,},
 266   { 116, 20,108, 12,114, 18,106, 10,},
 267   {  32, 64, 56, 88, 38, 70, 62, 94,},
 268   {  96,  0,120, 24,102,  6,126, 30,},
 269   {  48, 80, 40, 72, 54, 86, 46, 78,},
 270   { 112, 16,104,  8,118, 22,110, 14,},
 271 }};
 272
 273 static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 274
 275 const uint16_t dither_scale[15][16]={
 276 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 277 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
 278 {    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
 279 {    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
 280 {    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
 281 {    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
 282 {    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
 283 {    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
 284 {    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
 285 {    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
 286 {    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
 287 {    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
 288 {    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
 289 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 290 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 291 };
 292
 293 #define output_pixel(pos, val, bias, signedness) \
 294     if (big_endian) { \
 295         AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 296     } else { \
 297         AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
 298     }
 299
 300 static av_always_inline void
 301 yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
 302                          int big_endian, int output_bits)
 303 {
 304     int i;
 305     int shift = 3;
 306     av_assert0(output_bits == 16);
 307
 308     for (i = 0; i < dstW; i++) {
 309         int val = src[i] + (1 << (shift - 1));
 310         output_pixel(&dest[i], val, 0, uint);
 311     }
 312 }
 313
 314 static av_always_inline void
 315 yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
 316                          const int32_t **src, uint16_t *dest, int dstW,
 317                          int big_endian, int output_bits)
 318 {
 319     int i;
 320     int shift = 15;
 321     av_assert0(output_bits == 16);
 322
 323     for (i = 0; i < dstW; i++) {
 324         int val = 1 << (shift - 1);
 325         int j;
 326
 327         /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
 328          * filters (or anything with negative coeffs, the range can be slightly
 329          * wider in both directions. To account for this overflow, we subtract
 330          * a constant so it always fits in the signed range (assuming a
 331          * reasonable filterSize), and re-add that at the end. */
 332         val -= 0x40000000;
 333         for (j = 0; j < filterSize; j++)
 334             val += src[j][i] * filter[j];
 335
 336         output_pixel(&dest[i], val, 0x8000, int);
 337     }
 338 }
 339
 340 #undef output_pixel
 341
 342 #define output_pixel(pos, val) \
 343     if (big_endian) { \
 344         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 345     } else { \
 346         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 347     }
 348
 349 static av_always_inline void
 350 yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
 351                          int big_endian, int output_bits)
 352 {
 353     int i;
 354     int shift = 15 - output_bits;
 355
 356     for (i = 0; i < dstW; i++) {
 357         int val = src[i] + (1 << (shift - 1));
 358         output_pixel(&dest[i], val);
 359     }
 360 }
 361
 362 static av_always_inline void
 363 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
 364                          const int16_t **src, uint16_t *dest, int dstW,
 365                          int big_endian, int output_bits)
 366 {
 367     int i;
 368     int shift = 11 + 16 - output_bits;
 369
 370     for (i = 0; i < dstW; i++) {
 371         int val = 1 << (shift - 1);
 372         int j;
 373
 374         for (j = 0; j < filterSize; j++)
 375             val += src[j][i] * filter[j];
 376
 377         output_pixel(&dest[i], val);
 378     }
 379 }
 380
 381 #undef output_pixel
 382
 383 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
 384 static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
 385                               uint8_t *dest, int dstW, \
 386                               const uint8_t *dither, int offset)\
 387 { \
 388     yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
 389                          (uint16_t *) dest, dstW, is_be, bits); \
 390 }\
 391 static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
 392                               const int16_t **src, uint8_t *dest, int dstW, \
 393                               const uint8_t *dither, int offset)\
 394 { \
 395     yuv2planeX_## template_size ## _c_template(filter, \
 396                          filterSize, (const typeX_t **) src, \
 397                          (uint16_t *) dest, dstW, is_be, bits); \
 398 }
 399 yuv2NBPS( 9, BE, 1, 10, int16_t)
 400 yuv2NBPS( 9, LE, 0, 10, int16_t)
 401 yuv2NBPS(10, BE, 1, 10, int16_t)
 402 yuv2NBPS(10, LE, 0, 10, int16_t)
 403 yuv2NBPS(16, BE, 1, 16, int32_t)
 404 yuv2NBPS(16, LE, 0, 16, int32_t)
 405
 406 static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
 407                            const int16_t **src, uint8_t *dest, int dstW,
 408                            const uint8_t *dither, int offset)
 409 {
 410     int i;
 411     for (i=0; i<dstW; i++) {
 412         int val = dither[(i + offset) & 7] << 12;
 413         int j;
 414         for (j=0; j<filterSize; j++)
 415             val += src[j][i] * filter[j];
 416
 417         dest[i]= av_clip_uint8(val>>19);
 418     }
 419 }
 420
 421 static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
 422                            const uint8_t *dither, int offset)
 423 {
 424     int i;
 425     for (i=0; i<dstW; i++) {
 426         int val = (src[i] + dither[(i + offset) & 7]) >> 7;
 427         dest[i]= av_clip_uint8(val);
 428     }
 429 }
 430
 431 static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
 432                         const int16_t **chrUSrc, const int16_t **chrVSrc,
 433                         uint8_t *dest, int chrDstW)
 434 {
 435     enum PixelFormat dstFormat = c->dstFormat;
 436     const uint8_t *chrDither = c->chrDither8;
 437     int i;
 438
 439     if (dstFormat == PIX_FMT_NV12)
 440         for (i=0; i<chrDstW; i++) {
 441             int u = chrDither[i & 7] << 12;
 442             int v = chrDither[(i + 3) & 7] << 12;
 443             int j;
 444             for (j=0; j<chrFilterSize; j++) {
 445                 u += chrUSrc[j][i] * chrFilter[j];
 446                 v += chrVSrc[j][i] * chrFilter[j];
 447             }
 448
 449             dest[2*i]= av_clip_uint8(u>>19);
 450             dest[2*i+1]= av_clip_uint8(v>>19);
 451         }
 452     else
 453         for (i=0; i<chrDstW; i++) {
 454             int u = chrDither[i & 7] << 12;
 455             int v = chrDither[(i + 3) & 7] << 12;
 456             int j;
 457             for (j=0; j<chrFilterSize; j++) {
 458                 u += chrUSrc[j][i] * chrFilter[j];
 459                 v += chrVSrc[j][i] * chrFilter[j];
 460             }
 461
 462             dest[2*i]= av_clip_uint8(v>>19);
 463             dest[2*i+1]= av_clip_uint8(u>>19);
 464         }
 465 }
 466
 467 #define output_pixel(pos, val) \
 468         if (target == PIX_FMT_GRAY16BE) { \
 469             AV_WB16(pos, val); \
 470         } else { \
 471             AV_WL16(pos, val); \
 472         }
 473
 474 static av_always_inline void
 475 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 476                         const int32_t **lumSrc, int lumFilterSize,
 477                         const int16_t *chrFilter, const int32_t **chrUSrc,
 478                         const int32_t **chrVSrc, int chrFilterSize,
 479                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 480                         int y, enum PixelFormat target)
 481 {
 482     int i;
 483
 484     for (i = 0; i < (dstW >> 1); i++) {
 485         int j;
 486         int Y1 = (1 << 14) - 0x40000000;
 487         int Y2 = (1 << 14) - 0x40000000;
 488
 489         for (j = 0; j < lumFilterSize; j++) {
 490             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 491             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 492         }
 493         Y1 >>= 15;
 494         Y2 >>= 15;
 495         Y1 = av_clip_int16(Y1);
 496         Y2 = av_clip_int16(Y2);
 497         output_pixel(&dest[i * 2 + 0], 0x8000 + Y1);
 498         output_pixel(&dest[i * 2 + 1], 0x8000 + Y2);
 499     }
 500 }
 501
 502 static av_always_inline void
 503 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 504                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 505                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 506                         int yalpha, int uvalpha, int y,
 507                         enum PixelFormat target)
 508 {
 509     int  yalpha1 = 4095 - yalpha;
 510     int i;
 511     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 512
 513     for (i = 0; i < (dstW >> 1); i++) {
 514         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 515         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 516
 517         output_pixel(&dest[i * 2 + 0], Y1);
 518         output_pixel(&dest[i * 2 + 1], Y2);
 519     }
 520 }
 521
 522 static av_always_inline void
 523 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 524                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 525                         const int32_t *abuf0, uint16_t *dest, int dstW,
 526                         int uvalpha, int y, enum PixelFormat target)
 527 {
 528     int i;
 529
 530     for (i = 0; i < (dstW >> 1); i++) {
 531         int Y1 = (buf0[i * 2    ]+4)>>3;
 532         int Y2 = (buf0[i * 2 + 1]+4)>>3;
 533
 534         output_pixel(&dest[i * 2 + 0], Y1);
 535         output_pixel(&dest[i * 2 + 1], Y2);
 536     }
 537 }
 538
 539 #undef output_pixel
 540
 541 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 542 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 543                         const int16_t **_lumSrc, int lumFilterSize, \
 544                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 545                         const int16_t **_chrVSrc, int chrFilterSize, \
 546                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 547                         int y) \
 548 { \
 549     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 550                   **chrUSrc = (const int32_t **) _chrUSrc, \
 551                   **chrVSrc = (const int32_t **) _chrVSrc, \
 552                   **alpSrc  = (const int32_t **) _alpSrc; \
 553     uint16_t *dest = (uint16_t *) _dest; \
 554     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 555                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 556                           alpSrc, dest, dstW, y, fmt); \
 557 } \
 558  \
 559 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 560                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 561                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 562                         int yalpha, int uvalpha, int y) \
 563 { \
 564     const int32_t **buf  = (const int32_t **) _buf, \
 565                   **ubuf = (const int32_t **) _ubuf, \
 566                   **vbuf = (const int32_t **) _vbuf, \
 567                   **abuf = (const int32_t **) _abuf; \
 568     uint16_t *dest = (uint16_t *) _dest; \
 569     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 570                           dest, dstW, yalpha, uvalpha, y, fmt); \
 571 } \
 572  \
 573 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 574                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 575                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 576                         int uvalpha, int y) \
 577 { \
 578     const int32_t *buf0  = (const int32_t *)  _buf0, \
 579                  **ubuf  = (const int32_t **) _ubuf, \
 580                  **vbuf  = (const int32_t **) _vbuf, \
 581                   *abuf0 = (const int32_t *)  _abuf0; \
 582     uint16_t *dest = (uint16_t *) _dest; \
 583     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 584                                   dstW, uvalpha, y, fmt); \
 585 }
 586
 587 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE)
 588 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE)
 589
 590 #define output_pixel(pos, acc) \
 591     if (target == PIX_FMT_MONOBLACK) { \
 592         pos = acc; \
 593     } else { \
 594         pos = ~acc; \
 595     }
 596
 597 static av_always_inline void
 598 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 599                       const int16_t **lumSrc, int lumFilterSize,
 600                       const int16_t *chrFilter, const int16_t **chrUSrc,
 601                       const int16_t **chrVSrc, int chrFilterSize,
 602                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 603                       int y, enum PixelFormat target)
 604 {
 605     const uint8_t * const d128=dither_8x8_220[y&7];
 606     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 607     int i;
 608     unsigned acc = 0;
 609
 610     for (i = 0; i < dstW - 1; i += 2) {
 611         int j;
 612         int Y1 = 1 << 18;
 613         int Y2 = 1 << 18;
 614
 615         for (j = 0; j < lumFilterSize; j++) {
 616             Y1 += lumSrc[j][i]   * lumFilter[j];
 617             Y2 += lumSrc[j][i+1] * lumFilter[j];
 618         }
 619         Y1 >>= 19;
 620         Y2 >>= 19;
 621         if ((Y1 | Y2) & 0x100) {
 622             Y1 = av_clip_uint8(Y1);
 623             Y2 = av_clip_uint8(Y2);
 624         }
 625         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 626         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 627         if ((i & 7) == 6) {
 628             output_pixel(*dest++, acc);
 629         }
 630     }
 631 }
 632
 633 static av_always_inline void
 634 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 635                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 636                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 637                       int yalpha, int uvalpha, int y,
 638                       enum PixelFormat target)
 639 {
 640     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 641     const uint8_t * const d128 = dither_8x8_220[y & 7];
 642     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 643     int  yalpha1 = 4095 - yalpha;
 644     int i;
 645
 646     for (i = 0; i < dstW - 7; i += 8) {
 647         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 648         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 649         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 650         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 651         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 652         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 653         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 654         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 655         output_pixel(*dest++, acc);
 656     }
 657 }
 658
 659 static av_always_inline void
 660 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 661                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 662                       const int16_t *abuf0, uint8_t *dest, int dstW,
 663                       int uvalpha, int y, enum PixelFormat target)
 664 {
 665     const uint8_t * const d128 = dither_8x8_220[y & 7];
 666     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 667     int i;
 668
 669     for (i = 0; i < dstW - 7; i += 8) {
 670         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 671         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 672         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 673         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 674         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 675         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 676         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 677         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 678         output_pixel(*dest++, acc);
 679     }
 680 }
 681
 682 #undef output_pixel
 683
 684 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 685 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 686                                 const int16_t **lumSrc, int lumFilterSize, \
 687                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 688                                 const int16_t **chrVSrc, int chrFilterSize, \
 689                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 690                                 int y) \
 691 { \
 692     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 693                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 694                                   alpSrc, dest, dstW, y, fmt); \
 695 } \
 696  \
 697 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 698                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 699                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 700                                 int yalpha, int uvalpha, int y) \
 701 { \
 702     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 703                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 704 } \
 705  \
 706 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 707                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 708                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 709                                 int uvalpha, int y) \
 710 { \
 711     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 712                                   abuf0, dest, dstW, uvalpha, \
 713                                   y, fmt); \
 714 }
 715
 716 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE)
 717 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK)
 718
 719 #define output_pixels(pos, Y1, U, Y2, V) \
 720     if (target == PIX_FMT_YUYV422) { \
 721         dest[pos + 0] = Y1; \
 722         dest[pos + 1] = U;  \
 723         dest[pos + 2] = Y2; \
 724         dest[pos + 3] = V;  \
 725     } else { \
 726         dest[pos + 0] = U;  \
 727         dest[pos + 1] = Y1; \
 728         dest[pos + 2] = V;  \
 729         dest[pos + 3] = Y2; \
 730     }
 731
 732 static av_always_inline void
 733 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 734                      const int16_t **lumSrc, int lumFilterSize,
 735                      const int16_t *chrFilter, const int16_t **chrUSrc,
 736                      const int16_t **chrVSrc, int chrFilterSize,
 737                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 738                      int y, enum PixelFormat target)
 739 {
 740     int i;
 741
 742     for (i = 0; i < (dstW >> 1); i++) {
 743         int j;
 744         int Y1 = 1 << 18;
 745         int Y2 = 1 << 18;
 746         int U  = 1 << 18;
 747         int V  = 1 << 18;
 748
 749         for (j = 0; j < lumFilterSize; j++) {
 750             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 751             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 752         }
 753         for (j = 0; j < chrFilterSize; j++) {
 754             U += chrUSrc[j][i] * chrFilter[j];
 755             V += chrVSrc[j][i] * chrFilter[j];
 756         }
 757         Y1 >>= 19;
 758         Y2 >>= 19;
 759         U  >>= 19;
 760         V  >>= 19;
 761         if ((Y1 | Y2 | U | V) & 0x100) {
 762             Y1 = av_clip_uint8(Y1);
 763             Y2 = av_clip_uint8(Y2);
 764             U  = av_clip_uint8(U);
 765             V  = av_clip_uint8(V);
 766         }
 767         output_pixels(4*i, Y1, U, Y2, V);
 768     }
 769 }
 770
 771 static av_always_inline void
 772 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 773                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 774                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 775                      int yalpha, int uvalpha, int y,
 776                      enum PixelFormat target)
 777 {
 778     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 779                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 780                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 781     int  yalpha1 = 4095 - yalpha;
 782     int uvalpha1 = 4095 - uvalpha;
 783     int i;
 784
 785     for (i = 0; i < (dstW >> 1); i++) {
 786         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 787         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 788         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 789         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 790
 791         output_pixels(i * 4, Y1, U, Y2, V);
 792     }
 793 }
 794
 795 static av_always_inline void
 796 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 797                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 798                      const int16_t *abuf0, uint8_t *dest, int dstW,
 799                      int uvalpha, int y, enum PixelFormat target)
 800 {
 801     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 802                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 803     int i;
 804
 805     if (uvalpha < 2048) {
 806         for (i = 0; i < (dstW >> 1); i++) {
 807             int Y1 = buf0[i * 2]     >> 7;
 808             int Y2 = buf0[i * 2 + 1] >> 7;
 809             int U  = ubuf1[i]        >> 7;
 810             int V  = vbuf1[i]        >> 7;
 811
 812             output_pixels(i * 4, Y1, U, Y2, V);
 813         }
 814     } else {
 815         for (i = 0; i < (dstW >> 1); i++) {
 816             int Y1 =  buf0[i * 2]          >> 7;
 817             int Y2 =  buf0[i * 2 + 1]      >> 7;
 818             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 819             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 820
 821             output_pixels(i * 4, Y1, U, Y2, V);
 822         }
 823     }
 824 }
 825
 /* output_pixels is only meant for the yuv2422 templates; drop it here. */
 826 #undef output_pixels
 827
 /* Instantiate the yuv2422 templates once per packed 4:2:2 byte order; the
  * pixel format is passed as a constant to the always-inline template. */
 828 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422)
 829 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422)
 830
 /*
  * Helpers for the 48-bit RGB/BGR output templates. All three rely on names
  * in scope at the point of use (`target`, and `R`/`B` for R_B/B_R).
  *
  * R_B / B_R: first and third colour component in memory order -- R then B
  *            for the RGB48 targets, B then R for everything else.
  * output_pixel: store a 16-bit value at `pos`, big-endian when
  *            isBE(target) is true and little-endian otherwise. Wrapped in
  *            do { } while (0) so it expands to exactly one statement.
  */
 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 #define output_pixel(pos, val) \
     do { \
         if (isBE(target)) { \
             AV_WB16(pos, val); \
         } else { \
             AV_WL16(pos, val); \
         } \
     } while (0)
 839
 840 static av_always_inline void
 841 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 842                        const int32_t **lumSrc, int lumFilterSize,
 843                        const int16_t *chrFilter, const int32_t **chrUSrc,
 844                        const int32_t **chrVSrc, int chrFilterSize,
 845                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 846                        int y, enum PixelFormat target)
 847 {
 848     int i;
 849
 850     for (i = 0; i < (dstW >> 1); i++) {
 851         int j;
 852         int Y1 = -0x40000000;
 853         int Y2 = -0x40000000;
 854         int U  = -128 << 23; // 19
 855         int V  = -128 << 23;
 856         int R, G, B;
 857
 858         for (j = 0; j < lumFilterSize; j++) {
 859             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 860             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 861         }
 862         for (j = 0; j < chrFilterSize; j++) {
 863             U += chrUSrc[j][i] * chrFilter[j];
 864             V += chrVSrc[j][i] * chrFilter[j];
 865         }
 866
 867         // 8bit: 12+15=27; 16-bit: 12+19=31
 868         Y1 >>= 14; // 10
 869         Y1 += 0x10000;
 870         Y2 >>= 14;
 871         Y2 += 0x10000;
 872         U  >>= 14;
 873         V  >>= 14;
 874
 875         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 876         Y1 -= c->yuv2rgb_y_offset;
 877         Y2 -= c->yuv2rgb_y_offset;
 878         Y1 *= c->yuv2rgb_y_coeff;
 879         Y2 *= c->yuv2rgb_y_coeff;
 880         Y1 += 1 << 13; // 21
 881         Y2 += 1 << 13;
 882         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 883
 884         R = V * c->yuv2rgb_v2r_coeff;
 885         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 886         B =                            U * c->yuv2rgb_u2b_coeff;
 887
 888         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 889         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 890         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 891         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 892         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 893         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 894         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 895         dest += 6;
 896     }
 897 }
 898
 899 static av_always_inline void
 900 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 901                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 902                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 903                        int yalpha, int uvalpha, int y,
 904                        enum PixelFormat target)
 905 {
 906     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 907                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 908                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 909     int  yalpha1 = 4095 - yalpha;
 910     int uvalpha1 = 4095 - uvalpha;
 911     int i;
 912
 913     for (i = 0; i < (dstW >> 1); i++) {
 914         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 915         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 916         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 917         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 918         int R, G, B;
 919
 920         Y1 -= c->yuv2rgb_y_offset;
 921         Y2 -= c->yuv2rgb_y_offset;
 922         Y1 *= c->yuv2rgb_y_coeff;
 923         Y2 *= c->yuv2rgb_y_coeff;
 924         Y1 += 1 << 13;
 925         Y2 += 1 << 13;
 926
 927         R = V * c->yuv2rgb_v2r_coeff;
 928         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 929         B =                            U * c->yuv2rgb_u2b_coeff;
 930
 931         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 932         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 933         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 934         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 935         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 936         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 937         dest += 6;
 938     }
 939 }
 940
 941 static av_always_inline void
 942 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 943                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 944                        const int32_t *abuf0, uint16_t *dest, int dstW,
 945                        int uvalpha, int y, enum PixelFormat target)
 946 {
 947     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 948                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 949     int i;
 950
 951     if (uvalpha < 2048) {
 952         for (i = 0; i < (dstW >> 1); i++) {
 953             int Y1 = (buf0[i * 2]    ) >> 2;
 954             int Y2 = (buf0[i * 2 + 1]) >> 2;
 955             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 956             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 957             int R, G, B;
 958
 959             Y1 -= c->yuv2rgb_y_offset;
 960             Y2 -= c->yuv2rgb_y_offset;
 961             Y1 *= c->yuv2rgb_y_coeff;
 962             Y2 *= c->yuv2rgb_y_coeff;
 963             Y1 += 1 << 13;
 964             Y2 += 1 << 13;
 965
 966             R = V * c->yuv2rgb_v2r_coeff;
 967             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 968             B =                            U * c->yuv2rgb_u2b_coeff;
 969
 970             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 971             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 972             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 973             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 974             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 975             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 976             dest += 6;
 977         }
 978     } else {
 979         for (i = 0; i < (dstW >> 1); i++) {
 980             int Y1 = (buf0[i * 2]    ) >> 2;
 981             int Y2 = (buf0[i * 2 + 1]) >> 2;
 982             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
 983             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
 984             int R, G, B;
 985
 986             Y1 -= c->yuv2rgb_y_offset;
 987             Y2 -= c->yuv2rgb_y_offset;
 988             Y1 *= c->yuv2rgb_y_coeff;
 989             Y2 *= c->yuv2rgb_y_coeff;
 990             Y1 += 1 << 13;
 991             Y2 += 1 << 13;
 992
 993             R = V * c->yuv2rgb_v2r_coeff;
 994             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 995             B =                            U * c->yuv2rgb_u2b_coeff;
 996
 997             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 998             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 999             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1000             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1001             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1002             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1003             dest += 6;
1004         }
1005     }
1006 }
1007
/* Drop the helper macros of the 48-bit output templates. The component
 * selectors are defined in upper case (R_B / B_R), so those are the names
 * that must be undefined; undefining lower-case r_b / b_r here had no
 * effect and left R_B / B_R defined for the rest of the file. */
#undef output_pixel
#undef R_B
#undef B_R
1011
/* Instantiate the 48-bit output templates for each component order
 * (RGB/BGR) and byte order (BE/LE). All four share the yuv2rgb48 templates;
 * the pixel-format constant selects the variant inside them. */
1012 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE)
1013 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE)
1014 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE)
1015 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE)
1016
/**
 * Write one pair of horizontally adjacent output pixels (pair index i) using
 * per-component lookup tables.
 *
 * _r, _g and _b are lookup tables indexed by the luma values Y1/Y2; their
 * element type depends on the target (uint32_t for the 32-bit formats,
 * uint16_t for 565/555/444, uint8_t otherwise). The three lookups are either
 * summed into one packed value or, for RGB24/BGR24, stored as separate bytes.
 *
 * @param _dest     start of the output line
 * @param i         index of the pixel pair within the line
 * @param Y1,Y2     luma of the first / second pixel, used as table index
 * @param U,V       not used in this function
 * @param A1,A2     alpha of the first / second pixel; only read when
 *                  hasAlpha is set and the target is a 32-bit format
 * @param y         output line number, selects the dither row for the
 *                  16-bit and 8/4-bit formats
 * @param target    output pixel format
 * @param hasAlpha  non-zero to merge A1/A2 into 32-bit output
 */
1017 static av_always_inline void
1018 yuv2rgb_write(uint8_t *_dest, int i, unsigned Y1, unsigned Y2,
1019               unsigned U, unsigned V, unsigned A1, unsigned A2,
1020               const void *_r, const void *_g, const void *_b, int y,
1021               enum PixelFormat target, int hasAlpha)
1022 {
1023     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1024         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
             /* 32-bit formats: one uint32_t per pixel */
1025         uint32_t *dest = (uint32_t *) _dest;
1026         const uint32_t *r = (const uint32_t *) _r;
1027         const uint32_t *g = (const uint32_t *) _g;
1028         const uint32_t *b = (const uint32_t *) _b;
1029
1030 #if CONFIG_SMALL
             /* alpha is shifted by 0 bits for the *_1 layouts, by 24 otherwise */
1031         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1032
1033         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1034         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1035 #else
1036         if (hasAlpha) {
1037             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1038
1039             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1040             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1041         } else {
1042             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1043             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1044         }
1045 #endif
1046     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
             /* 24-bit formats: three separate bytes per pixel */
1047         uint8_t *dest = (uint8_t *) _dest;
1048         const uint8_t *r = (const uint8_t *) _r;
1049         const uint8_t *g = (const uint8_t *) _g;
1050         const uint8_t *b = (const uint8_t *) _b;
1051
         /* table used for the first / third byte, depending on RGB vs BGR */
1052 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1053 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1054
1055         dest[i * 6 + 0] = r_b[Y1];
1056         dest[i * 6 + 1] =   g[Y1];
1057         dest[i * 6 + 2] = b_r[Y1];
1058         dest[i * 6 + 3] = r_b[Y2];
1059         dest[i * 6 + 4] =   g[Y2];
1060         dest[i * 6 + 5] = b_r[Y2];
1061 #undef r_b
1062 #undef b_r
1063     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1064                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1065                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
             /* 16-bit formats: one uint16_t per pixel, with a per-component
              * dither offset added to the table index */
1066         uint16_t *dest = (uint16_t *) _dest;
1067         const uint16_t *r = (const uint16_t *) _r;
1068         const uint16_t *g = (const uint16_t *) _g;
1069         const uint16_t *b = (const uint16_t *) _b;
1070         int dr1, dg1, db1, dr2, dg2, db2;
1071
1072         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1073             dr1 = dither_2x2_8[ y & 1     ][0];
1074             dg1 = dither_2x2_4[ y & 1     ][0];
1075             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1076             dr2 = dither_2x2_8[ y & 1     ][1];
1077             dg2 = dither_2x2_4[ y & 1     ][1];
1078             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1079         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1080             dr1 = dither_2x2_8[ y & 1     ][0];
1081             dg1 = dither_2x2_8[ y & 1     ][1];
1082             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1083             dr2 = dither_2x2_8[ y & 1     ][1];
1084             dg2 = dither_2x2_8[ y & 1     ][0];
1085             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1086         } else {
                 /* RGB444 / BGR444 */
1087             dr1 = dither_4x4_16[ y & 3     ][0];
1088             dg1 = dither_4x4_16[ y & 3     ][1];
1089             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1090             dr2 = dither_4x4_16[ y & 3     ][1];
1091             dg2 = dither_4x4_16[ y & 3     ][0];
1092             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1093         }
1094
1095         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1096         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1097     } else /* 8/4-bit */ {
1098         uint8_t *dest = (uint8_t *) _dest;
1099         const uint8_t *r = (const uint8_t *) _r;
1100         const uint8_t *g = (const uint8_t *) _g;
1101         const uint8_t *b = (const uint8_t *) _b;
1102         int dr1, dg1, db1, dr2, dg2, db2;
1103
             /* dither offsets depend on both the line (y & 7) and the
              * horizontal pixel position ((i * 2 + n) & 7) */
1104         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1105             const uint8_t * const d64 = dither_8x8_73[y & 7];
1106             const uint8_t * const d32 = dither_8x8_32[y & 7];
1107             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1108             db1 =       d64[(i * 2 + 0) & 7];
1109             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1110             db2 =       d64[(i * 2 + 1) & 7];
1111         } else {
1112             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1113             const uint8_t * const d128 = dither_8x8_220[y & 7];
1114             dr1 = db1 = d128[(i * 2 + 0) & 7];
1115             dg1 =        d64[(i * 2 + 0) & 7];
1116             dr2 = db2 = d128[(i * 2 + 1) & 7];
1117             dg2 =        d64[(i * 2 + 1) & 7];
1118         }
1119
1120         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
                 /* two pixels per byte: first in the low nibble, second
                  * shifted into the high nibble */
1121             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1122                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1123         } else {
1124             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1125             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1126         }
1127     }
1128 }
1129
1130 static av_always_inline void
1131 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1132                      const int16_t **lumSrc, int lumFilterSize,
1133                      const int16_t *chrFilter, const int16_t **chrUSrc,
1134                      const int16_t **chrVSrc, int chrFilterSize,
1135                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1136                      int y, enum PixelFormat target, int hasAlpha)
1137 {
1138     int i;
1139
1140     for (i = 0; i < (dstW >> 1); i++) {
1141         int j;
1142         int Y1 = 1 << 18;
1143         int Y2 = 1 << 18;
1144         int U  = 1 << 18;
1145         int V  = 1 << 18;
1146         int av_unused A1, A2;
1147         const void *r, *g, *b;
1148
1149         for (j = 0; j < lumFilterSize; j++) {
1150             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1151             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1152         }
1153         for (j = 0; j < chrFilterSize; j++) {
1154             U += chrUSrc[j][i] * chrFilter[j];
1155             V += chrVSrc[j][i] * chrFilter[j];
1156         }
1157         Y1 >>= 19;
1158         Y2 >>= 19;
1159         U  >>= 19;
1160         V  >>= 19;
1161         if ((Y1 | Y2 | U | V) & 0x100) {
1162             Y1 = av_clip_uint8(Y1);
1163             Y2 = av_clip_uint8(Y2);
1164             U  = av_clip_uint8(U);
1165             V  = av_clip_uint8(V);
1166         }
1167         if (hasAlpha) {
1168             A1 = 1 << 18;
1169             A2 = 1 << 18;
1170             for (j = 0; j < lumFilterSize; j++) {
1171                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1172                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1173             }
1174             A1 >>= 19;
1175             A2 >>= 19;
1176             if ((A1 | A2) & 0x100) {
1177                 A1 = av_clip_uint8(A1);
1178                 A2 = av_clip_uint8(A2);
1179             }
1180         }
1181
1182         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1183         r =  c->table_rV[V];
1184         g = (c->table_gU[U] + c->table_gV[V]);
1185         b =  c->table_bU[U];
1186
1187         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1188                       r, g, b, y, target, hasAlpha);
1189     }
1190 }
1191
1192 static av_always_inline void
1193 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1194                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1195                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1196                      int yalpha, int uvalpha, int y,
1197                      enum PixelFormat target, int hasAlpha)
1198 {
1199     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1200                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1201                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1202                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1203                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1204     int  yalpha1 = 4095 - yalpha;
1205     int uvalpha1 = 4095 - uvalpha;
1206     int i;
1207
1208     for (i = 0; i < (dstW >> 1); i++) {
1209         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1210         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1211         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1212         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1213         int A1, A2;
1214         const void *r =  c->table_rV[V],
1215                    *g = (c->table_gU[U] + c->table_gV[V]),
1216                    *b =  c->table_bU[U];
1217
1218         if (hasAlpha) {
1219             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1220             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1221         }
1222
1223         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1224                       r, g, b, y, target, hasAlpha);
1225     }
1226 }
1227
1228 static av_always_inline void
1229 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1230                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1231                      const int16_t *abuf0, uint8_t *dest, int dstW,
1232                      int uvalpha, int y, enum PixelFormat target,
1233                      int hasAlpha)
1234 {
1235     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1236                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1237     int i;
1238
1239     if (uvalpha < 2048) {
1240         for (i = 0; i < (dstW >> 1); i++) {
1241             int Y1 = buf0[i * 2]     >> 7;
1242             int Y2 = buf0[i * 2 + 1] >> 7;
1243             int U  = ubuf1[i]        >> 7;
1244             int V  = vbuf1[i]        >> 7;
1245             int A1, A2;
1246             const void *r =  c->table_rV[V],
1247                        *g = (c->table_gU[U] + c->table_gV[V]),
1248                        *b =  c->table_bU[U];
1249
1250             if (hasAlpha) {
1251                 A1 = abuf0[i * 2    ] >> 7;
1252                 A2 = abuf0[i * 2 + 1] >> 7;
1253             }
1254
1255             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1256                           r, g, b, y, target, hasAlpha);
1257         }
1258     } else {
1259         for (i = 0; i < (dstW >> 1); i++) {
1260             int Y1 =  buf0[i * 2]          >> 7;
1261             int Y2 =  buf0[i * 2 + 1]      >> 7;
1262             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1263             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1264             int A1, A2;
1265             const void *r =  c->table_rV[V],
1266                        *g = (c->table_gU[U] + c->table_gV[V]),
1267                        *b =  c->table_bU[U];
1268
1269             if (hasAlpha) {
1270                 A1 = abuf0[i * 2    ] >> 7;
1271                 A2 = abuf0[i * 2 + 1] >> 7;
1272             }
1273
1274             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1275                           r, g, b, y, target, hasAlpha);
1276         }
1277     }
1278 }
1279
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
 * Defines the function name##ext##_X_c, which forwards all of its arguments
 * to name##base##_X_c_template and appends `fmt` and `hasAlpha`. Note that
 * `hasAlpha` is pasted as an expression into the wrapper body, so it may
 * refer to the wrapper's `c` parameter.
 */
1280 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1281 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1282                                 const int16_t **lumSrc, int lumFilterSize, \
1283                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1284                                 const int16_t **chrVSrc, int chrFilterSize, \
1285                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1286                                 int y) \
1287 { \
1288     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1289                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1290                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1291 }
/*
 * YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
 * Expands YUV2RGBWRAPPERX and additionally defines name##ext##_2_c and
 * name##ext##_1_c, which forward to name##base##_2_c_template and
 * name##base##_1_c_template respectively, again appending `fmt` and
 * `hasAlpha`.
 */
1292 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1293 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1294 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1295                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1296                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1297                                 int yalpha, int uvalpha, int y) \
1298 { \
1299     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1300                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1301 } \
1302  \
1303 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1304                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1305                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1306                                 int uvalpha, int y) \
1307 { \
1308     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1309                                   dstW, uvalpha, y, fmt, hasAlpha); \
1310 }
1311
/*
 * Table-driven RGB output functions (_X_c, _2_c, _1_c for each format).
 *
 * With CONFIG_SMALL a single 32-bit variant is generated per layout and the
 * alpha decision is made at run time (CONFIG_SWSCALE_ALPHA && c->alpPixBuf).
 * Otherwise separate variants with a constant hasAlpha are generated:
 * a32/a32_1 (alpha, only if CONFIG_SWSCALE_ALPHA) and x32/x32_1 (no alpha).
 * All remaining formats are instantiated without alpha.
 */
1312 #if CONFIG_SMALL
1313 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1314 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1315 #else
1316 #if CONFIG_SWSCALE_ALPHA
1317 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1)
1318 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1)
1319 #endif
1320 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0)
1321 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0)
1322 #endif
1323 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0)
1324 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0)
1325 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0)
1326 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0)
1327 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0)
1328 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0)
1329 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0)
1330 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0)
1331
1332 static av_always_inline void
1333 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1334                           const int16_t **lumSrc, int lumFilterSize,
1335                           const int16_t *chrFilter, const int16_t **chrUSrc,
1336                           const int16_t **chrVSrc, int chrFilterSize,
1337                           const int16_t **alpSrc, uint8_t *dest,
1338                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1339 {
1340     int i;
1341     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1342
1343     for (i = 0; i < dstW; i++) {
1344         int j;
1345         int Y = 1<<9;
1346         int U = (1<<9)-(128 << 19);
1347         int V = (1<<9)-(128 << 19);
1348         int av_unused A;
1349         int R, G, B;
1350
1351         for (j = 0; j < lumFilterSize; j++) {
1352             Y += lumSrc[j][i] * lumFilter[j];
1353         }
1354         for (j = 0; j < chrFilterSize; j++) {
1355             U += chrUSrc[j][i] * chrFilter[j];
1356             V += chrVSrc[j][i] * chrFilter[j];
1357         }
1358         Y >>= 10;
1359         U >>= 10;
1360         V >>= 10;
1361         if (hasAlpha) {
1362             A = 1 << 18;
1363             for (j = 0; j < lumFilterSize; j++) {
1364                 A += alpSrc[j][i] * lumFilter[j];
1365             }
1366             A >>= 19;
1367             if (A & 0x100)
1368                 A = av_clip_uint8(A);
1369         }
1370         Y -= c->yuv2rgb_y_offset;
1371         Y *= c->yuv2rgb_y_coeff;
1372         Y += 1 << 21;
1373         R = Y + V*c->yuv2rgb_v2r_coeff;
1374         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1375         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1376         if ((R | G | B) & 0xC0000000) {
1377             R = av_clip_uintp2(R, 30);
1378             G = av_clip_uintp2(G, 30);
1379             B = av_clip_uintp2(B, 30);
1380         }
1381
1382         switch(target) {
1383         case PIX_FMT_ARGB:
1384             dest[0] = hasAlpha ? A : 255;
1385             dest[1] = R >> 22;
1386             dest[2] = G >> 22;
1387             dest[3] = B >> 22;
1388             break;
1389         case PIX_FMT_RGB24:
1390             dest[0] = R >> 22;
1391             dest[1] = G >> 22;
1392             dest[2] = B >> 22;
1393             break;
1394         case PIX_FMT_RGBA:
1395             dest[0] = R >> 22;
1396             dest[1] = G >> 22;
1397             dest[2] = B >> 22;
1398             dest[3] = hasAlpha ? A : 255;
1399             break;
1400         case PIX_FMT_ABGR:
1401             dest[0] = hasAlpha ? A : 255;
1402             dest[1] = B >> 22;
1403             dest[2] = G >> 22;
1404             dest[3] = R >> 22;
1405             break;
1406         case PIX_FMT_BGR24:
1407             dest[0] = B >> 22;
1408             dest[1] = G >> 22;
1409             dest[2] = R >> 22;
1410             break;
1411         case PIX_FMT_BGRA:
1412             dest[0] = B >> 22;
1413             dest[1] = G >> 22;
1414             dest[2] = R >> 22;
1415             dest[3] = hasAlpha ? A : 255;
1416             break;
1417         }
1418         dest += step;
1419     }
1420 }
1421
/*
 * Full-chroma RGB output functions; only the _X_c entry point is generated
 * for each of them (YUV2RGBWRAPPERX).
 *
 * With CONFIG_SMALL one function per 32-bit layout decides about alpha at
 * run time (CONFIG_SWSCALE_ALPHA && c->alpPixBuf). Otherwise the functions
 * with an "a" in the name (only if CONFIG_SWSCALE_ALPHA) use hasAlpha = 1
 * and the ones with an "x" use hasAlpha = 0. The 24-bit formats never carry
 * alpha.
 */
1422 #if CONFIG_SMALL
1423 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1424 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1425 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1426 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
1427 #else
1428 #if CONFIG_SWSCALE_ALPHA
1429 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1)
1430 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1)
1431 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1)
1432 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1)
1433 #endif
1434 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0)
1435 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0)
1436 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0)
1437 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0)
1438 #endif
1439 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0)
1440 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0)
1441
1442 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1443                                        int width, int height,
1444                                        int y, uint8_t val)
1445 {
1446     int i;
1447     uint8_t *ptr = plane + stride*y;
1448     for (i=0; i<height; i++) {
1449         memset(ptr, val, width);
1450         ptr += stride;
1451     }
1452 }
1453
/* Read one 16-bit sample with the byte order of the source format `origin`. */
1454 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1455
/* Map the first/third stored component (locals r_b / b_r of the templates
 * that follow) to red and blue: swapped for the BGR48 origins. */
1456 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1457 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1458
1459 static av_always_inline void
1460 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1461                     enum PixelFormat origin)
1462 {
1463     int i;
1464     for (i = 0; i < width; i++) {
1465         unsigned int r_b = input_pixel(&src[i*3+0]);
1466         unsigned int   g = input_pixel(&src[i*3+1]);
1467         unsigned int b_r = input_pixel(&src[i*3+2]);
1468
1469         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1470     }
1471 }
1472
1473 static av_always_inline void
1474 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1475                     const uint16_t *src1, const uint16_t *src2,
1476                     int width, enum PixelFormat origin)
1477 {
1478     int i;
1479     assert(src1==src2);
1480     for (i = 0; i < width; i++) {
1481         int r_b = input_pixel(&src1[i*3+0]);
1482         int   g = input_pixel(&src1[i*3+1]);
1483         int b_r = input_pixel(&src1[i*3+2]);
1484
1485         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1486         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1487     }
1488 }
1489
1490 static av_always_inline void
1491 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1492                           const uint16_t *src1, const uint16_t *src2,
1493                           int width, enum PixelFormat origin)
1494 {
1495     int i;
1496     assert(src1==src2);
1497     for (i = 0; i < width; i++) {
1498         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1499         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1500         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1501
1502         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1503         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1504     }
1505 }
1506
1507 #undef r
1508 #undef b
1509 #undef input_pixel
1510
/*
 * rgb48funcs(pattern, BE_LE, origin) generates three static functions named
 * <pattern>48<BE_LE>ToY_c, <pattern>48<BE_LE>ToUV_c and
 * <pattern>48<BE_LE>ToUV_half_c.  Each one casts its byte pointers to
 * uint16_t pointers and forwards to the matching rgb48To*_c_template with
 * `origin` as the pixel format.  The unused0/unused1/unused parameters are
 * not referenced by the generated bodies.
 */
1511 #define rgb48funcs(pattern, BE_LE, origin) \
1512 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
1513                                     int width, uint32_t *unused) \
1514 { \
1515     const uint16_t *src = (const uint16_t *) _src; \
1516     uint16_t *dst = (uint16_t *) _dst; \
1517     rgb48ToY_c_template(dst, src, width, origin); \
1518 } \
1519  \
1520 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1521                                     const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1522                                     int width, uint32_t *unused) \
1523 { \
1524     const uint16_t *src1 = (const uint16_t *) _src1, \
1525                    *src2 = (const uint16_t *) _src2; \
1526     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1527     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1528 } \
1529  \
1530 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1531                                     const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1532                                     int width, uint32_t *unused) \
1533 { \
1534     const uint16_t *src1 = (const uint16_t *) _src1, \
1535                    *src2 = (const uint16_t *) _src2; \
1536     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1537     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1538 }
1539
/* Instantiate the converters for both component orders and both endiannesses. */
1540 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE)
1541 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE)
1542 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE)
1543 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE)
1544
/*
 * input_pixel(i): fetch packed pixel number `i` from the local `src` byte
 * pointer.  For the RGBA/BGRA/ARGB/ABGR formats the pixel is read with
 * AV_RN32A from offset i*4; for every other format a 16-bit value is read
 * from offset i*2 with AV_RB16 or AV_RL16 depending on isBE(origin).
 * Relies on `src` and `origin` being in scope at the point of use.
 */
1545 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1546                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1547                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1548
1549 static av_always_inline void
1550 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1551                        int width, enum PixelFormat origin,
1552                        int shr,   int shg,   int shb, int shp,
1553                        int maskr, int maskg, int maskb,
1554                        int rsh,   int gsh,   int bsh, int S)
1555 {
1556     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh;
1557     const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
1558     int i;
1559
1560     for (i = 0; i < width; i++) {
1561         int px = input_pixel(i) >> shp;
1562         int b = (px & maskb) >> shb;
1563         int g = (px & maskg) >> shg;
1564         int r = (px & maskr) >> shr;
1565
1566         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1567     }
1568 }
1569
1570 static av_always_inline void
1571 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1572                         const uint8_t *src, int width,
1573                         enum PixelFormat origin,
1574                         int shr,   int shg,   int shb, int shp,
1575                         int maskr, int maskg, int maskb,
1576                         int rsh,   int gsh,   int bsh, int S)
1577 {
1578     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1579               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh;
1580     const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
1581     int i;
1582
1583     for (i = 0; i < width; i++) {
1584         int px = input_pixel(i) >> shp;
1585         int b = (px & maskb) >> shb;
1586         int g = (px & maskg) >> shg;
1587         int r = (px & maskr) >> shr;
1588
1589         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1590         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1591     }
1592 }
1593
/**
 * Convert one line of packed 15/16/32-bit RGB pixels to chroma at half
 * horizontal resolution: every output U/V sample is computed from the sum
 * of two neighbouring input pixels.
 *
 * @param dstU   output U line
 * @param dstV   output V line
 * @param src    packed input line holding 2*width pixels
 * @param width  number of output samples
 * @param origin source pixel format (used by input_pixel and to select how
 *               the green sum is extracted)
 * The remaining parameters have the same roles as in the other
 * rgb16_32To*_c_template functions: shp shifts the whole pixel, the masks
 * and shr/shg/shb isolate the components, rsh/gsh/bsh pre-shift the
 * coefficients and S is the base for the final right shift.
 */
1594 static av_always_inline void
1595 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1596                              const uint8_t *src, int width,
1597                              enum PixelFormat origin,
1598                              int shr,   int shg,   int shb, int shp,
1599                              int maskr, int maskg, int maskb,
1600                              int rsh,   int gsh,   int bsh, int S)
1601 {
1602     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1603               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
    /* maskgx selects every bit that belongs to neither red nor blue */
1604               maskgx = ~(maskr | maskb);
    /* rounding/offset term; one bit larger than in the full-resolution
     * template because the final shift below is one bit larger too */
1605     const unsigned rnd = (256U<<(S)) + (1<<(S-6));
1606     int i;
1607
    /* widen each mask by one bit so the carry of a two-pixel sum is kept */
1608     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1609     for (i = 0; i < width; i++) {
1610         int px0 = input_pixel(2 * i + 0) >> shp;
1611         int px1 = input_pixel(2 * i + 1) >> shp;
        /* g: sum of the non-red/non-blue bits of both pixels;
         * rb: sum of the remaining (red and blue) bits */
1612         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1613         int rb = px0 + px1 - g;
1614
1615         b = (rb & maskb) >> shb;
        /* for shifted pixels and the 565 formats the green sum is only
         * shifted, not masked; otherwise the widened green mask is applied */
1616         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1617             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1618             g >>= shg;
1619         } else {
1620             g = (g  & maskg) >> shg;
1621         }
1622         r = (rb & maskr) >> shr;
1623
        /* the extra +1 in the shift divides the two-pixel sums by two */
1624         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1625         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1626     }
1627 }
1628
1629 #undef input_pixel
1630
/*
 * rgb16_32_wrapper(fmt, name, ...) generates three static functions named
 * <name>ToY_c, <name>ToUV_c and <name>ToUV_half_c.  Each one forwards to the
 * matching rgb16_32To*_c_template, casting the destination pointers to
 * int16_t* and passing `fmt` as the pixel format together with the
 * per-format constants:
 *   shr/shg/shb       right shifts applied to each masked component
 *   shp               right shift applied to the whole pixel first
 *   maskr/maskg/maskb bit masks isolating each component
 *   rsh/gsh/bsh       left shifts applied to the conversion coefficients
 *   S                 base of the final right shift
 * The unused*/dummy parameters are not referenced by the generated bodies.
 */
1631 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1632                          maskg, maskb, rsh, gsh, bsh, S) \
1633 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
1634                           int width, uint32_t *unused) \
1635 { \
1636     rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, \
1637                            shr, shg, shb, shp, \
1638                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1639 } \
1640  \
1641 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1642                            const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1643                            int width, uint32_t *unused) \
1644 { \
1645     rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,  \
1646                             shr, shg, shb, shp, \
1647                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1648 } \
1649  \
1650 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1651                                 const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1652                                 int width, uint32_t *unused) \
1653 { \
1654     rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
1655                                  shr, shg, shb, shp, \
1656                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1657 }
1658
/* One instantiation per supported packed 32-, 16- and 15-bit RGB/BGR format. */
1659 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
1660 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8)
1661 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
1662 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8)
1663 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
1664 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
1665 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
1666 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
1667 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8)
1668 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7)
1669 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8)
1670 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7)
1671
1672 static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
1673                          const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
1674                          int width, enum PixelFormat origin)
1675 {
1676     int i;
1677     for (i = 0; i < width; i++) {
1678         unsigned int g   = gsrc[2*i] + gsrc[2*i+1];
1679         unsigned int b   = bsrc[2*i] + bsrc[2*i+1];
1680         unsigned int r   = rsrc[2*i] + rsrc[2*i+1];
1681
1682         dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1683         dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1684     }
1685 }
1686
/**
 * Extract the alpha channel from packed 4-byte pixels whose first byte is
 * alpha, writing each value shifted left by 6 bits.
 *
 * @param dst    output line, one int16_t per pixel
 * @param src    packed input line, 4 bytes per pixel
 * @param width  number of pixels
 * unused1, unused2 and unused are not referenced.
 */
static void abgrToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    const uint8_t *px = src;
    int n;

    for (n = 0; n < width; n++, px += 4)
        dst[n] = px[0] << 6;
}
1694
/**
 * Extract the alpha channel from packed 4-byte pixels whose last byte is
 * alpha, writing each value shifted left by 6 bits.
 *
 * @param dst    output line, one int16_t per pixel
 * @param src    packed input line, 4 bytes per pixel
 * @param width  number of pixels
 * unused1, unused2 and unused are not referenced.
 */
static void rgbaToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    const uint8_t *px = src;
    int n;

    for (n = 0; n < width; n++, px += 4)
        dst[n] = px[3] << 6;
}
1702
/**
 * Look up each palette index and output the top byte (bits 24..31) of the
 * palette entry, shifted left by 6 bits.
 *
 * @param dst    output line, one int16_t per pixel
 * @param src    input line of 8-bit palette indices
 * @param width  number of pixels
 * @param pal    palette of 32-bit entries indexed by the source byte
 * unused1 and unused2 are not referenced.
 */
static void palToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
    int n = 0;

    while (n < width) {
        const int index = src[n];

        dst[n] = (pal[index] >> 24) << 6;
        n++;
    }
}
1712
/**
 * Look up each palette index and output the low byte (bits 0..7) of the
 * palette entry, shifted left by 6 bits.
 *
 * @param dst    output line, one int16_t per pixel
 * @param src    input line of 8-bit palette indices
 * @param width  number of pixels; declared as int so the signature matches
 *               the int width of the toYV12 function pointer in hyscale()
 *               and of the sibling pal* converters
 * @param pal    palette of 32-bit entries indexed by the source byte
 * unused1 and unused2 are not referenced.
 */
static void palToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
    int i;
    for (i=0; i<width; i++) {
        int d= src[i];

        dst[i]= (pal[d] & 0xFF)<<6;
    }
}
1722
/**
 * Look up each palette index and output bits 8..15 of the palette entry as
 * U and bits 16..23 as V, each shifted left by 6 bits.
 *
 * @param dstU   output U line
 * @param dstV   output V line
 * @param src1   input line of 8-bit palette indices
 * @param src2   must be equal to src1 (asserted)
 * @param width  number of pixels
 * @param pal    palette of 32-bit entries indexed by the source byte
 * unused0 is not referenced.
 */
static void palToUV_c(uint16_t *dstU, int16_t *dstV,
                           const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *pal)
{
    int n = 0;

    assert(src1 == src2);
    while (n < width) {
        const int entry = pal[src1[n]];

        dstU[n] = (uint8_t)(entry >> 8) << 6;
        dstV[n] = (uint8_t)(entry >> 16) << 6;
        n++;
    }
}
1736
/**
 * Expand 1-bit-per-pixel input to luma, with the bits inverted first:
 * a cleared source bit yields 16383 and a set bit yields 0.  Bits are
 * consumed most-significant first.
 *
 * @param dst    output line, one int16_t per pixel
 * @param src    packed input line, 8 pixels per byte
 * @param width  number of pixels; a trailing partial byte is handled
 * unused1, unused2 and unused are not referenced.
 */
static void monowhite2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width & 7;
    int byte, bit;

    for (byte = 0; byte < full_bytes; byte++) {
        const int inverted = ~src[byte];
        int16_t *out = dst + 8 * byte;

        for (bit = 0; bit < 8; bit++)
            out[bit] = (inverted & (0x80 >> bit)) ? 16383 : 0;
    }
    if (tail_bits) {
        const int inverted = ~src[byte];
        int16_t *out = dst + 8 * byte;

        for (bit = 0; bit < tail_bits; bit++)
            out[bit] = (inverted & (0x80 >> bit)) ? 16383 : 0;
    }
}
1751
/**
 * Expand 1-bit-per-pixel input to luma: a set source bit yields 16383 and
 * a cleared bit yields 0.  Bits are consumed most-significant first.
 *
 * @param dst    output line, one int16_t per pixel
 * @param src    packed input line, 8 pixels per byte
 * @param width  number of pixels; a trailing partial byte is handled
 * unused1, unused2 and unused are not referenced.
 */
static void monoblack2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width & 7;
    int byte, bit;

    for (byte = 0; byte < full_bytes; byte++) {
        const int bits = src[byte];
        int16_t *out = dst + 8 * byte;

        for (bit = 0; bit < 8; bit++)
            out[bit] = (bits & (0x80 >> bit)) ? 16383 : 0;
    }
    if (tail_bits) {
        const int bits = src[byte];
        int16_t *out = dst + 8 * byte;

        for (bit = 0; bit < tail_bits; bit++)
            out[bit] = (bits & (0x80 >> bit)) ? 16383 : 0;
    }
}
1766
//FIXME yuy2* can read up to 7 samples too much

/**
 * Extract luma from a packed line where it occupies every even byte
 * (bytes 0, 2, 4, ...).
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 2 bytes per pixel
 * @param width  number of pixels
 * unused1, unused2 and unused are not referenced.
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                      uint32_t *unused)
{
    const uint8_t *in = src;
    int n;

    for (n = 0; n < width; n++, in += 2)
        dst[n] = in[0];
}
1776
/**
 * Extract chroma from a packed line of 4-byte groups: byte 1 of each group
 * goes to U and byte 3 goes to V.
 *
 * @param dstU   output U line, one byte per group
 * @param dstV   output V line, one byte per group
 * @param src1   packed input line
 * @param src2   must be equal to src1 (asserted after the conversion)
 * @param width  number of 4-byte groups
 * unused0 and unused are not referenced.
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int n;

    for (n = 0; n < width; n++, group += 4) {
        dstU[n] = group[1];
        dstV[n] = group[3];
    }
    assert(src1 == src2);
}
1787
/**
 * Copy a line of 16-bit samples, passing every sample through av_bswap16().
 *
 * @param _dst   output line, treated as uint16_t samples
 * @param _src   input line, treated as uint16_t samples
 * @param width  number of 16-bit samples
 * unused1, unused2 and unused are not referenced.
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2,  int width, uint32_t *unused)
{
    const uint16_t *in = (const uint16_t *) _src;
    uint16_t *out = (uint16_t *) _dst;
    int n = 0;

    while (n < width) {
        out[n] = av_bswap16(in[n]);
        n++;
    }
}
1797
/**
 * Copy two lines of 16-bit chroma samples, passing every sample through
 * av_bswap16().
 *
 * @param _dstU  output U line, treated as uint16_t samples
 * @param _dstV  output V line, treated as uint16_t samples
 * @param _src1  input U line, treated as uint16_t samples
 * @param _src2  input V line, treated as uint16_t samples
 * @param width  number of 16-bit samples per line
 * unused0 and unused are not referenced.
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *in_u = (const uint16_t *) _src1;
    const uint16_t *in_v = (const uint16_t *) _src2;
    uint16_t *out_u = (uint16_t *) _dstU;
    uint16_t *out_v = (uint16_t *) _dstV;
    int n = 0;

    while (n < width) {
        out_u[n] = av_bswap16(in_u[n]);
        out_v[n] = av_bswap16(in_v[n]);
        n++;
    }
}
1810
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract luma from a packed line where it occupies every odd byte
 * (bytes 1, 3, 5, ...).
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 2 bytes per pixel
 * @param width  number of pixels
 * unused1, unused2 and unused are not referenced.
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,  int width,
                      uint32_t *unused)
{
    const uint8_t *in = src;
    int n;

    for (n = 0; n < width; n++, in += 2)
        dst[n] = in[1];
}
1820
/**
 * Extract chroma from a packed line of 4-byte groups: byte 0 of each group
 * goes to U and byte 2 goes to V.
 *
 * @param dstU   output U line, one byte per group
 * @param dstV   output V line, one byte per group
 * @param src1   packed input line
 * @param src2   must be equal to src1 (asserted after the conversion)
 * @param width  number of 4-byte groups
 * unused0 and unused are not referenced.
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int n;

    for (n = 0; n < width; n++, group += 4) {
        dstU[n] = group[0];
        dstV[n] = group[2];
    }
    assert(src1 == src2);
}
1831
1832 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1833                                         const uint8_t *src, int width)
1834 {
1835     int i;
1836     for (i = 0; i < width; i++) {
1837         dst1[i] = src[2*i+0];
1838         dst2[i] = src[2*i+1];
1839     }
1840 }
1841
/*
 * Split an interleaved chroma line taken from src1: the first byte of each
 * pair is written to dstU and the second to dstV (see nvXXtoUV_c).
 * unused0, src2 and unused are not referenced.
 */
1842 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1843                        const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1844                        int width, uint32_t *unused)
1845 {
1846     nvXXtoUV_c(dstU, dstV, src1, width);
1847 }
1848
/*
 * Split an interleaved chroma line taken from src1 with the opposite order
 * to nv12ToUV_c: the first byte of each pair is written to dstV and the
 * second to dstU (the destinations are swapped in the nvXXtoUV_c call).
 * unused0, src2 and unused are not referenced.
 */
1849 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1850                        const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1851                        int width, uint32_t *unused)
1852 {
1853     nvXXtoUV_c(dstV, dstU, src1, width);
1854 }
1855
/*
 * input_pixel(pos): read a 16-bit value at `pos` with AV_RB16 when
 * isBE(origin) is true and AV_RL16 otherwise; needs `origin` in scope.
 * NOTE(review): none of the bgr24/rgb24/planar_rgb converters that follow
 * appear to use this macro or declare `origin` - confirm whether it is still
 * needed and where the matching #undef is.
 */
1856 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1857
1858 static void bgr24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
1859                        int width, uint32_t *unused)
1860 {
1861     int i;
1862     for (i=0; i<width; i++) {
1863         int b= src[i*3+0];
1864         int g= src[i*3+1];
1865         int r= src[i*3+2];
1866
1867         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1868     }
1869 }
1870
1871 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1872                         const uint8_t *src2, int width, uint32_t *unused)
1873 {
1874     int i;
1875     for (i=0; i<width; i++) {
1876         int b= src1[3*i + 0];
1877         int g= src1[3*i + 1];
1878         int r= src1[3*i + 2];
1879
1880         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1881         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1882     }
1883     assert(src1 == src2);
1884 }
1885
1886 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1887                              const uint8_t *src2, int width, uint32_t *unused)
1888 {
1889     int i;
1890     for (i=0; i<width; i++) {
1891         int b= src1[6*i + 0] + src1[6*i + 3];
1892         int g= src1[6*i + 1] + src1[6*i + 4];
1893         int r= src1[6*i + 2] + src1[6*i + 5];
1894
1895         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1896         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1897     }
1898     assert(src1 == src2);
1899 }
1900
1901 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1902                        uint32_t *unused)
1903 {
1904     int i;
1905     for (i=0; i<width; i++) {
1906         int r= src[i*3+0];
1907         int g= src[i*3+1];
1908         int b= src[i*3+2];
1909
1910         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1911     }
1912 }
1913
1914 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1915                         const uint8_t *src2, int width, uint32_t *unused)
1916 {
1917     int i;
1918     assert(src1==src2);
1919     for (i=0; i<width; i++) {
1920         int r= src1[3*i + 0];
1921         int g= src1[3*i + 1];
1922         int b= src1[3*i + 2];
1923
1924         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1925         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1926     }
1927 }
1928
1929 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1930                                     const uint8_t *src2, int width, uint32_t *unused)
1931 {
1932     int i;
1933     assert(src1==src2);
1934     for (i=0; i<width; i++) {
1935         int r= src1[6*i + 0] + src1[6*i + 3];
1936         int g= src1[6*i + 1] + src1[6*i + 4];
1937         int b= src1[6*i + 2] + src1[6*i + 5];
1938
1939         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1940         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1941     }
1942 }
1943
1944 static void planar_rgb_to_y(uint16_t *dst, const uint8_t *src[4], int width)
1945 {
1946     int i;
1947     for (i = 0; i < width; i++) {
1948         int g = src[0][i];
1949         int b = src[1][i];
1950         int r = src[2][i];
1951
1952         dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1953     }
1954 }
1955
1956 static void planar_rgb16le_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1957 {
1958     int i;
1959     const uint16_t **src = (const uint16_t **) _src;
1960     uint16_t *dst = (uint16_t *) _dst;
1961     for (i = 0; i < width; i++) {
1962         int g = AV_RL16(src[0] + i);
1963         int b = AV_RL16(src[1] + i);
1964         int r = AV_RL16(src[2] + i);
1965
1966         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1967     }
1968 }
1969
1970 static void planar_rgb16be_to_y(uint8_t *_dst, const uint8_t *_src[4], int width)
1971 {
1972     int i;
1973     const uint16_t **src = (const uint16_t **) _src;
1974     uint16_t *dst = (uint16_t *) _dst;
1975     for (i = 0; i < width; i++) {
1976         int g = AV_RB16(src[0] + i);
1977         int b = AV_RB16(src[1] + i);
1978         int r = AV_RB16(src[2] + i);
1979
1980         dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
1981     }
1982 }
1983
1984 static void planar_rgb_to_uv(uint16_t *dstU, uint16_t *dstV, const uint8_t *src[4], int width)
1985 {
1986     int i;
1987     for (i = 0; i < width; i++) {
1988         int g = src[0][i];
1989         int b = src[1][i];
1990         int r = src[2][i];
1991
1992         dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1993         dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1994     }
1995 }
1996
1997 static void planar_rgb16le_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
1998 {
1999     int i;
2000     const uint16_t **src = (const uint16_t **) _src;
2001     uint16_t *dstU = (uint16_t *) _dstU;
2002     uint16_t *dstV = (uint16_t *) _dstV;
2003     for (i = 0; i < width; i++) {
2004         int g = AV_RL16(src[0] + i);
2005         int b = AV_RL16(src[1] + i);
2006         int r = AV_RL16(src[2] + i);
2007
2008         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2009         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2010     }
2011 }
2012
2013 static void planar_rgb16be_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src[4], int width)
2014 {
2015     int i;
2016     const uint16_t **src = (const uint16_t **) _src;
2017     uint16_t *dstU = (uint16_t *) _dstU;
2018     uint16_t *dstV = (uint16_t *) _dstV;
2019     for (i = 0; i < width; i++) {
2020         int g = AV_RB16(src[0] + i);
2021         int b = AV_RB16(src[1] + i);
2022         int r = AV_RB16(src[2] + i);
2023
2024         dstU[i] = (RU * r + GU * g + BU * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2025         dstV[i] = (RV * r + GV * g + BV * b + (257 << RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT + 1);
2026     }
2027 }
2028
2029 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
2030                            const int16_t *filter,
2031                            const int16_t *filterPos, int filterSize)
2032 {
2033     int i;
2034     int32_t *dst = (int32_t *) _dst;
2035     const uint16_t *src = (const uint16_t *) _src;
2036     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2037     int sh = bits - 4;
2038
2039     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2040         sh= 9;
2041
2042     for (i = 0; i < dstW; i++) {
2043         int j;
2044         int srcPos = filterPos[i];
2045         int val = 0;
2046
2047         for (j = 0; j < filterSize; j++) {
2048             val += src[srcPos + j] * filter[filterSize * i + j];
2049         }
2050         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
2051         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
2052     }
2053 }
2054
2055 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
2056                            const int16_t *filter,
2057                            const int16_t *filterPos, int filterSize)
2058 {
2059     int i;
2060     const uint16_t *src = (const uint16_t *) _src;
2061     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2062
2063     if(sh<15)
2064         sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2065
2066     for (i = 0; i < dstW; i++) {
2067         int j;
2068         int srcPos = filterPos[i];
2069         int val = 0;
2070
2071         for (j = 0; j < filterSize; j++) {
2072             val += src[srcPos + j] * filter[filterSize * i + j];
2073         }
2074         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2075         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2076     }
2077 }
2078
2079 // bilinear / bicubic scaling
2080 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2081                           const int16_t *filter, const int16_t *filterPos,
2082                           int filterSize)
2083 {
2084     int i;
2085     for (i=0; i<dstW; i++) {
2086         int j;
2087         int srcPos= filterPos[i];
2088         int val=0;
2089         for (j=0; j<filterSize; j++) {
2090             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2091         }
2092         //filter += hFilterSize;
2093         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2094         //dst[i] = val>>7;
2095     }
2096 }
2097
2098 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2099                           const int16_t *filter, const int16_t *filterPos,
2100                           int filterSize)
2101 {
2102     int i;
2103     int32_t *dst = (int32_t *) _dst;
2104     for (i=0; i<dstW; i++) {
2105         int j;
2106         int srcPos= filterPos[i];
2107         int val=0;
2108         for (j=0; j<filterSize; j++) {
2109             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2110         }
2111         //filter += hFilterSize;
2112         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2113         //dst[i] = val>>7;
2114     }
2115 }
2116
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * Rescale a pair of chroma lines in place: every sample is clipped to at
 * most 30775, multiplied by 4663, offset by -9289992 and shifted right
 * by 12.
 *
 * @param dstU   U line, modified in place
 * @param dstV   V line, modified in place
 * @param width  number of samples per line
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int n = 0;

    while (n < width) {
        dstU[n] = (FFMIN(dstU[n],30775)*4663 - 9289992)>>12; //-264
        dstV[n] = (FFMIN(dstV[n],30775)*4663 - 9289992)>>12; //-264
        n++;
    }
}
/**
 * Rescale a pair of chroma lines in place: every sample is multiplied by
 * 1799, offset by 4081085 and shifted right by 11.
 *
 * @param dstU   U line, modified in place
 * @param dstV   V line, modified in place
 * @param width  number of samples per line
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int n = 0;

    while (n < width) {
        const int u = dstU[n];
        const int v = dstV[n];

        dstU[n] = (u*1799 + 4081085)>>11; //1469
        dstV[n] = (v*1799 + 4081085)>>11; //1469
        n++;
    }
}
/**
 * Rescale a luma line in place: every sample is clipped to at most 30189,
 * multiplied by 19077, offset by -39057361 and shifted right by 14.
 *
 * @param dst    luma line, modified in place
 * @param width  number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int n = 0;

    while (n < width) {
        dst[n] = (FFMIN(dst[n],30189)*19077 - 39057361)>>14;
        n++;
    }
}
/**
 * Rescale a luma line in place: every sample is multiplied by 14071,
 * offset by 33561947 and shifted right by 14.
 *
 * @param dst    luma line, modified in place
 * @param width  number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int n = 0;

    while (n < width) {
        const int y = dst[n];

        dst[n] = (y*14071 + 33561947)>>14;
        n++;
    }
}
2147
/**
 * Rescale a pair of chroma lines in place, operating on 32-bit samples
 * (the int16_t pointers are reinterpreted as int32_t).  Every sample is
 * clipped to at most 30775<<4, multiplied by 4663, offset by
 * -(9289992<<4) and shifted right by 12.
 *
 * The product is formed in 64-bit arithmetic: (30775<<4)*4663 is
 * 2296061200, which does not fit in a 32-bit int, so a plain int
 * multiplication would overflow for samples near the clip limit.  The
 * final result always fits in int32_t.
 *
 * @param _dstU  U line, modified in place
 * @param _dstV  V line, modified in place
 * @param width  number of samples per line
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    const int32_t limit = 30775 << 4;

    for (i = 0; i < width; i++) {
        int32_t u = dstU[i];
        int32_t v = dstV[i];

        if (u > limit)
            u = limit;
        if (v > limit)
            v = limit;

        dstU[i] = (int32_t)(((int64_t)u * 4663 - (9289992 << 4)) >> 12); //-264
        dstV[i] = (int32_t)(((int64_t)v * 4663 - (9289992 << 4)) >> 12); //-264
    }
}
/**
 * Rescale a pair of chroma lines in place, operating on 32-bit samples
 * (the int16_t pointers are reinterpreted as int32_t).  Every sample is
 * multiplied by 1799, offset by 4081085<<4 and shifted right by 11.
 *
 * @param _dstU  U line, modified in place
 * @param _dstV  V line, modified in place
 * @param width  number of samples per line
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u_line = (int32_t *) _dstU;
    int32_t *v_line = (int32_t *) _dstV;
    int n = 0;

    while (n < width) {
        u_line[n] = (u_line[n]*1799 + (4081085<<4))>>11; //1469
        v_line[n] = (v_line[n]*1799 + (4081085<<4))>>11; //1469
        n++;
    }
}
/**
 * Rescale a luma line in place, operating on 32-bit samples (the int16_t
 * pointer is reinterpreted as int32_t).  Every sample is clipped to at
 * most 30189<<4, multiplied by 4769, offset by -(39057361<<2) and shifted
 * right by 12.
 *
 * The product is formed in 64-bit arithmetic: (30189<<4)*4769 is
 * 2303541456, which does not fit in a 32-bit int, so a plain int
 * multiplication would overflow for samples near the clip limit.  The
 * final result always fits in int32_t.
 *
 * @param _dst   luma line, modified in place
 * @param width  number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    const int32_t limit = 30189 << 4;

    for (i = 0; i < width; i++) {
        int32_t y = dst[i];

        if (y > limit)
            y = limit;

        dst[i] = (int32_t)(((int64_t)y * 4769 - (39057361 << 2)) >> 12);
    }
}
/**
 * Rescale a luma line in place, operating on 32-bit samples (the int16_t
 * pointer is reinterpreted as int32_t).  Every sample is multiplied by
 * 14071/4, offset by (33561947<<4)/4 and shifted right by 12.
 *
 * @param _dst   luma line, modified in place
 * @param width  number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int32_t *line = (int32_t *) _dst;
    int n = 0;

    while (n < width) {
        line[n] = (line[n]*(14071/4) + (33561947<<4)/4)>>12;
        n++;
    }
}
2182
2183 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2184                            const uint8_t *src, int srcW, int xInc)
2185 {
2186     int i;
2187     unsigned int xpos=0;
2188     for (i=0;i<dstWidth;i++) {
2189         register unsigned int xx=xpos>>16;
2190         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2191         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2192         xpos+=xInc;
2193     }
2194     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2195         dst[i] = src[srcW-1]*128;
2196 }
2197
2198 // *** horizontal scale Y line to temp buffer
2199 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2200                                      const uint8_t *src_in[4], int srcW, int xInc,
2201                                      const int16_t *hLumFilter,
2202                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2203                                      uint8_t *formatConvBuffer,
2204                                      uint32_t *pal, int isAlpha)
2205 {
2206     void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2207     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2208     const uint8_t *src = src_in[isAlpha ? 3 : 0];
2209
2210     if (toYV12) {
2211         toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
2212         src= formatConvBuffer;
2213     } else if (c->readLumPlanar && !isAlpha) {
2214         c->readLumPlanar(formatConvBuffer, src_in, srcW);
2215         src = formatConvBuffer;
2216     }
2217
2218     if (!c->hyscale_fast) {
2219         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2220     } else { // fast bilinear upscale / crap downscale
2221         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2222     }
2223
2224     if (convertRange)
2225         convertRange(dst, dstWidth);
2226 }
2227
2228 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2229                            int dstWidth, const uint8_t *src1,
2230                            const uint8_t *src2, int srcW, int xInc)
2231 {
2232     int i;
2233     unsigned int xpos=0;
2234     for (i=0;i<dstWidth;i++) {
2235         register unsigned int xx=xpos>>16;
2236         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2237         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2238         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2239         xpos+=xInc;
2240     }
2241     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2242         dst1[i] = src1[srcW-1]*128;
2243         dst2[i] = src2[srcW-1]*128;
2244     }
2245 }
2246
2247 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2248                                      const uint8_t *src_in[4],
2249                                      int srcW, int xInc, const int16_t *hChrFilter,
2250                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2251                                      uint8_t *formatConvBuffer, uint32_t *pal)
2252 {
2253     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
2254     if (c->chrToYV12) {
2255         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2256         c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
2257         src1= formatConvBuffer;
2258         src2= buf2;
2259     } else if (c->readChrPlanar) {
2260         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2261         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
2262         src1= formatConvBuffer;
2263         src2= buf2;
2264     }
2265
2266     if (!c->hcscale_fast) {
2267         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2268         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2269     } else { // fast bilinear upscale / crap downscale
2270         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2271     }
2272
2273     if (c->chrConvertRange)
2274         c->chrConvertRange(dst1, dst2, dstWidth);
2275 }
2276
2277 static av_always_inline void
2278 find_c_packed_planar_out_funcs(SwsContext *c,
2279                                yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX,
2280                                yuv2interleavedX_fn *yuv2nv12cX,
2281                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2282                                yuv2packedX_fn *yuv2packedX)
2283 {
2284     enum PixelFormat dstFormat = c->dstFormat;
2285
2286     if (is16BPS(dstFormat)) {
2287         *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c  : yuv2planeX_16LE_c;
2288         *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c  : yuv2plane1_16LE_c;
2289     } else if (is9_OR_10BPS(dstFormat)) {
2290         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2291             *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c  : yuv2planeX_9LE_c;
2292             *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c  : yuv2plane1_9LE_c;
2293         } else {
2294             *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c  : yuv2planeX_10LE_c;
2295             *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c  : yuv2plane1_10LE_c;
2296         }
2297     } else {
2298         *yuv2plane1 = yuv2plane1_8_c;
2299         *yuv2planeX = yuv2planeX_8_c;
2300         if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
2301             *yuv2nv12cX = yuv2nv12cX_c;
2302     }
2303
2304     if(c->flags & SWS_FULL_CHR_H_INT) {
2305         switch (dstFormat) {
2306             case PIX_FMT_RGBA:
2307 #if CONFIG_SMALL
2308                 *yuv2packedX = yuv2rgba32_full_X_c;
2309 #else
2310 #if CONFIG_SWSCALE_ALPHA
2311                 if (c->alpPixBuf) {
2312                     *yuv2packedX = yuv2rgba32_full_X_c;
2313                 } else
2314 #endif /* CONFIG_SWSCALE_ALPHA */
2315                 {
2316                     *yuv2packedX = yuv2rgbx32_full_X_c;
2317                 }
2318 #endif /* !CONFIG_SMALL */
2319                 break;
2320             case PIX_FMT_ARGB:
2321 #if CONFIG_SMALL
2322                 *yuv2packedX = yuv2argb32_full_X_c;
2323 #else
2324 #if CONFIG_SWSCALE_ALPHA
2325                 if (c->alpPixBuf) {
2326                     *yuv2packedX = yuv2argb32_full_X_c;
2327                 } else
2328 #endif /* CONFIG_SWSCALE_ALPHA */
2329                 {
2330                     *yuv2packedX = yuv2xrgb32_full_X_c;
2331                 }
2332 #endif /* !CONFIG_SMALL */
2333                 break;
2334             case PIX_FMT_BGRA:
2335 #if CONFIG_SMALL
2336                 *yuv2packedX = yuv2bgra32_full_X_c;
2337 #else
2338 #if CONFIG_SWSCALE_ALPHA
2339                 if (c->alpPixBuf) {
2340                     *yuv2packedX = yuv2bgra32_full_X_c;
2341                 } else
2342 #endif /* CONFIG_SWSCALE_ALPHA */
2343                 {
2344                     *yuv2packedX = yuv2bgrx32_full_X_c;
2345                 }
2346 #endif /* !CONFIG_SMALL */
2347                 break;
2348             case PIX_FMT_ABGR:
2349 #if CONFIG_SMALL
2350                 *yuv2packedX = yuv2abgr32_full_X_c;
2351 #else
2352 #if CONFIG_SWSCALE_ALPHA
2353                 if (c->alpPixBuf) {
2354                     *yuv2packedX = yuv2abgr32_full_X_c;
2355                 } else
2356 #endif /* CONFIG_SWSCALE_ALPHA */
2357                 {
2358                     *yuv2packedX = yuv2xbgr32_full_X_c;
2359                 }
2360 #endif /* !CONFIG_SMALL */
2361                 break;
2362             case PIX_FMT_RGB24:
2363             *yuv2packedX = yuv2rgb24_full_X_c;
2364             break;
2365         case PIX_FMT_BGR24:
2366             *yuv2packedX = yuv2bgr24_full_X_c;
2367             break;
2368         }
2369         if(!*yuv2packedX)
2370             goto YUV_PACKED;
2371     } else {
2372         YUV_PACKED:
2373         switch (dstFormat) {
2374         case PIX_FMT_GRAY16BE:
2375             *yuv2packed1 = yuv2gray16BE_1_c;
2376             *yuv2packed2 = yuv2gray16BE_2_c;
2377             *yuv2packedX = yuv2gray16BE_X_c;
2378             break;
2379         case PIX_FMT_GRAY16LE:
2380             *yuv2packed1 = yuv2gray16LE_1_c;
2381             *yuv2packed2 = yuv2gray16LE_2_c;
2382             *yuv2packedX = yuv2gray16LE_X_c;
2383             break;
2384         case PIX_FMT_MONOWHITE:
2385             *yuv2packed1 = yuv2monowhite_1_c;
2386             *yuv2packed2 = yuv2monowhite_2_c;
2387             *yuv2packedX = yuv2monowhite_X_c;
2388             break;
2389         case PIX_FMT_MONOBLACK:
2390             *yuv2packed1 = yuv2monoblack_1_c;
2391             *yuv2packed2 = yuv2monoblack_2_c;
2392             *yuv2packedX = yuv2monoblack_X_c;
2393             break;
2394         case PIX_FMT_YUYV422:
2395             *yuv2packed1 = yuv2yuyv422_1_c;
2396             *yuv2packed2 = yuv2yuyv422_2_c;
2397             *yuv2packedX = yuv2yuyv422_X_c;
2398             break;
2399         case PIX_FMT_UYVY422:
2400             *yuv2packed1 = yuv2uyvy422_1_c;
2401             *yuv2packed2 = yuv2uyvy422_2_c;
2402             *yuv2packedX = yuv2uyvy422_X_c;
2403             break;
2404         case PIX_FMT_RGB48LE:
2405             *yuv2packed1 = yuv2rgb48le_1_c;
2406             *yuv2packed2 = yuv2rgb48le_2_c;
2407             *yuv2packedX = yuv2rgb48le_X_c;
2408             break;
2409         case PIX_FMT_RGB48BE:
2410             *yuv2packed1 = yuv2rgb48be_1_c;
2411             *yuv2packed2 = yuv2rgb48be_2_c;
2412             *yuv2packedX = yuv2rgb48be_X_c;
2413             break;
2414         case PIX_FMT_BGR48LE:
2415             *yuv2packed1 = yuv2bgr48le_1_c;
2416             *yuv2packed2 = yuv2bgr48le_2_c;
2417             *yuv2packedX = yuv2bgr48le_X_c;
2418             break;
2419         case PIX_FMT_BGR48BE:
2420             *yuv2packed1 = yuv2bgr48be_1_c;
2421             *yuv2packed2 = yuv2bgr48be_2_c;
2422             *yuv2packedX = yuv2bgr48be_X_c;
2423             break;
2424         case PIX_FMT_RGB32:
2425         case PIX_FMT_BGR32:
2426 #if CONFIG_SMALL
2427             *yuv2packed1 = yuv2rgb32_1_c;
2428             *yuv2packed2 = yuv2rgb32_2_c;
2429             *yuv2packedX = yuv2rgb32_X_c;
2430 #else
2431 #if CONFIG_SWSCALE_ALPHA
2432                 if (c->alpPixBuf) {
2433                     *yuv2packed1 = yuv2rgba32_1_c;
2434                     *yuv2packed2 = yuv2rgba32_2_c;
2435                     *yuv2packedX = yuv2rgba32_X_c;
2436                 } else
2437 #endif /* CONFIG_SWSCALE_ALPHA */
2438                 {
2439                     *yuv2packed1 = yuv2rgbx32_1_c;
2440                     *yuv2packed2 = yuv2rgbx32_2_c;
2441                     *yuv2packedX = yuv2rgbx32_X_c;
2442                 }
2443 #endif /* !CONFIG_SMALL */
2444             break;
2445         case PIX_FMT_RGB32_1:
2446         case PIX_FMT_BGR32_1:
2447 #if CONFIG_SMALL
2448                 *yuv2packed1 = yuv2rgb32_1_1_c;
2449                 *yuv2packed2 = yuv2rgb32_1_2_c;
2450                 *yuv2packedX = yuv2rgb32_1_X_c;
2451 #else
2452 #if CONFIG_SWSCALE_ALPHA
2453                 if (c->alpPixBuf) {
2454                     *yuv2packed1 = yuv2rgba32_1_1_c;
2455                     *yuv2packed2 = yuv2rgba32_1_2_c;
2456                     *yuv2packedX = yuv2rgba32_1_X_c;
2457                 } else
2458 #endif /* CONFIG_SWSCALE_ALPHA */
2459                 {
2460                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2461                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2462                     *yuv2packedX = yuv2rgbx32_1_X_c;
2463                 }
2464 #endif /* !CONFIG_SMALL */
2465                 break;
2466         case PIX_FMT_RGB24:
2467             *yuv2packed1 = yuv2rgb24_1_c;
2468             *yuv2packed2 = yuv2rgb24_2_c;
2469             *yuv2packedX = yuv2rgb24_X_c;
2470             break;
2471         case PIX_FMT_BGR24:
2472             *yuv2packed1 = yuv2bgr24_1_c;
2473             *yuv2packed2 = yuv2bgr24_2_c;
2474             *yuv2packedX = yuv2bgr24_X_c;
2475             break;
2476         case PIX_FMT_RGB565LE:
2477         case PIX_FMT_RGB565BE:
2478         case PIX_FMT_BGR565LE:
2479         case PIX_FMT_BGR565BE:
2480             *yuv2packed1 = yuv2rgb16_1_c;
2481             *yuv2packed2 = yuv2rgb16_2_c;
2482             *yuv2packedX = yuv2rgb16_X_c;
2483             break;
2484         case PIX_FMT_RGB555LE:
2485         case PIX_FMT_RGB555BE:
2486         case PIX_FMT_BGR555LE:
2487         case PIX_FMT_BGR555BE:
2488             *yuv2packed1 = yuv2rgb15_1_c;
2489             *yuv2packed2 = yuv2rgb15_2_c;
2490             *yuv2packedX = yuv2rgb15_X_c;
2491             break;
2492         case PIX_FMT_RGB444LE:
2493         case PIX_FMT_RGB444BE:
2494         case PIX_FMT_BGR444LE:
2495         case PIX_FMT_BGR444BE:
2496             *yuv2packed1 = yuv2rgb12_1_c;
2497             *yuv2packed2 = yuv2rgb12_2_c;
2498             *yuv2packedX = yuv2rgb12_X_c;
2499             break;
2500         case PIX_FMT_RGB8:
2501         case PIX_FMT_BGR8:
2502             *yuv2packed1 = yuv2rgb8_1_c;
2503             *yuv2packed2 = yuv2rgb8_2_c;
2504             *yuv2packedX = yuv2rgb8_X_c;
2505             break;
2506         case PIX_FMT_RGB4:
2507         case PIX_FMT_BGR4:
2508             *yuv2packed1 = yuv2rgb4_1_c;
2509             *yuv2packed2 = yuv2rgb4_2_c;
2510             *yuv2packedX = yuv2rgb4_X_c;
2511             break;
2512         case PIX_FMT_RGB4_BYTE:
2513         case PIX_FMT_BGR4_BYTE:
2514             *yuv2packed1 = yuv2rgb4b_1_c;
2515             *yuv2packed2 = yuv2rgb4b_2_c;
2516             *yuv2packedX = yuv2rgb4b_X_c;
2517             break;
2518         }
2519     }
2520 }
2521
/* Set to 1 to have DEBUG_BUFFERS() emit its messages through av_log(). */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Debug trace helper; expects a variable named `c` in scope at the call site,
 * which is passed to av_log() as the logging context.
 * Wrapped in do { } while (0) so that the expansion is a single statement:
 * with a bare `if`, an `else` following `if (x) DEBUG_BUFFERS(...);` would
 * silently attach to the macro's own `if`.
 */
#define DEBUG_BUFFERS(...)                              \
    do {                                                \
        if (DEBUG_SWSCALE_BUFFERS)                      \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__);       \
    } while (0)
2524
2525 static int swScale(SwsContext *c, const uint8_t* src[],
2526                    int srcStride[], int srcSliceY,
2527                    int srcSliceH, uint8_t* dst[], int dstStride[])
2528 {
2529     /* load a few things into local vars to make the code more readable? and faster */
2530     const int srcW= c->srcW;
2531     const int dstW= c->dstW;
2532     const int dstH= c->dstH;
2533     const int chrDstW= c->chrDstW;
2534     const int chrSrcW= c->chrSrcW;
2535     const int lumXInc= c->lumXInc;
2536     const int chrXInc= c->chrXInc;
2537     const enum PixelFormat dstFormat= c->dstFormat;
2538     const int flags= c->flags;
2539     int16_t *vLumFilterPos= c->vLumFilterPos;
2540     int16_t *vChrFilterPos= c->vChrFilterPos;
2541     int16_t *hLumFilterPos= c->hLumFilterPos;
2542     int16_t *hChrFilterPos= c->hChrFilterPos;
2543     int16_t *hLumFilter= c->hLumFilter;
2544     int16_t *hChrFilter= c->hChrFilter;
2545     int32_t *lumMmxFilter= c->lumMmxFilter;
2546     int32_t *chrMmxFilter= c->chrMmxFilter;
2547     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2548     const int vLumFilterSize= c->vLumFilterSize;
2549     const int vChrFilterSize= c->vChrFilterSize;
2550     const int hLumFilterSize= c->hLumFilterSize;
2551     const int hChrFilterSize= c->hChrFilterSize;
2552     int16_t **lumPixBuf= c->lumPixBuf;
2553     int16_t **chrUPixBuf= c->chrUPixBuf;
2554     int16_t **chrVPixBuf= c->chrVPixBuf;
2555     int16_t **alpPixBuf= c->alpPixBuf;
2556     const int vLumBufSize= c->vLumBufSize;
2557     const int vChrBufSize= c->vChrBufSize;
2558     uint8_t *formatConvBuffer= c->formatConvBuffer;
2559     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2560     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2561     int lastDstY;
2562     uint32_t *pal=c->pal_yuv;
2563     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2564
2565     yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
2566     yuv2planarX_fn yuv2planeX = c->yuv2planeX;
2567     yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
2568     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2569     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2570     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2571
2572     /* vars which will change and which we need to store back in the context */
2573     int dstY= c->dstY;
2574     int lumBufIndex= c->lumBufIndex;
2575     int chrBufIndex= c->chrBufIndex;
2576     int lastInLumBuf= c->lastInLumBuf;
2577     int lastInChrBuf= c->lastInChrBuf;
2578
2579     if (isPacked(c->srcFormat)) {
2580         src[0]=
2581         src[1]=
2582         src[2]=
2583         src[3]= src[0];
2584         srcStride[0]=
2585         srcStride[1]=
2586         srcStride[2]=
2587         srcStride[3]= srcStride[0];
2588     }
2589     srcStride[1]<<= c->vChrDrop;
2590     srcStride[2]<<= c->vChrDrop;
2591
2592     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2593                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2594                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2595     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2596                    srcSliceY,    srcSliceH,    dstY,    dstH);
2597     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2598                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2599
2600     if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 || dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
2601         static int warnedAlready=0; //FIXME move this into the context perhaps
2602         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2603             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2604                    "         ->cannot do aligned memory accesses anymore\n");
2605             warnedAlready=1;
2606         }
2607     }
2608
2609     if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
2610         || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
2611         || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
2612     ) {
2613         static int warnedAlready=0;
2614         int cpu_flags = av_get_cpu_flags();
2615         if (HAVE_MMX2 && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
2616             av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
2617             warnedAlready=1;
2618         }
2619     }
2620
2621     /* Note the user might start scaling the picture in the middle so this
2622        will not get executed. This is not really intended but works
2623        currently, so people might do it. */
2624     if (srcSliceY ==0) {
2625         lumBufIndex=-1;
2626         chrBufIndex=-1;
2627         dstY=0;
2628         lastInLumBuf= -1;
2629         lastInChrBuf= -1;
2630     }
2631
2632     if (!should_dither) {
2633         c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
2634     }
2635     lastDstY= dstY;
2636
2637     for (;dstY < dstH; dstY++) {
2638         const int chrDstY= dstY>>c->chrDstVSubSample;
2639         uint8_t *dest[4] = {
2640             dst[0] + dstStride[0] * dstY,
2641             dst[1] + dstStride[1] * chrDstY,
2642             dst[2] + dstStride[2] * chrDstY,
2643             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2644         };
2645         int use_mmx_vfilter= c->use_mmx_vfilter;
2646
2647         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2648         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2649         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2650         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2651         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2652         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2653         int enough_lines;
2654
2655         //handle holes (FAST_BILINEAR & weird filters)
2656         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2657         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2658         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2659         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2660
2661         DEBUG_BUFFERS("dstY: %d\n", dstY);
2662         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2663                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2664         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2665                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2666
2667         // Do we have enough lines in this slice to output the dstY line
2668         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2669
2670         if (!enough_lines) {
2671             lastLumSrcY = srcSliceY + srcSliceH - 1;
2672             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2673             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2674                                             lastLumSrcY, lastChrSrcY);
2675         }
2676
2677         //Do horizontal scaling
2678         while(lastInLumBuf < lastLumSrcY) {
2679             const uint8_t *src1[4] = {
2680                 src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
2681                 src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
2682                 src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
2683                 src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
2684             };
2685             lumBufIndex++;
2686             assert(lumBufIndex < 2*vLumBufSize);
2687             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2688             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2689             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2690                     hLumFilter, hLumFilterPos, hLumFilterSize,
2691                     formatConvBuffer,
2692                     pal, 0);
2693             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2694                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src1, srcW,
2695                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2696                         formatConvBuffer,
2697                         pal, 1);
2698             lastInLumBuf++;
2699             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2700                                lumBufIndex,    lastInLumBuf);
2701         }
2702         while(lastInChrBuf < lastChrSrcY) {
2703             const uint8_t *src1[4] = {
2704                 src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
2705                 src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
2706                 src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
2707                 src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
2708             };
2709             chrBufIndex++;
2710             assert(chrBufIndex < 2*vChrBufSize);
2711             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2712             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2713             //FIXME replace parameters through context struct (some at least)
2714
2715             if (c->needs_hcscale)
2716                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2717                           chrDstW, src1, chrSrcW, chrXInc,
2718                           hChrFilter, hChrFilterPos, hChrFilterSize,
2719                           formatConvBuffer, pal);
2720             lastInChrBuf++;
2721             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2722                                chrBufIndex,    lastInChrBuf);
2723         }
2724         //wrap buf index around to stay inside the ring buffer
2725         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2726         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2727         if (!enough_lines)
2728             break; //we can't output a dstY line so let's try with the next slice
2729
2730 #if HAVE_MMX
2731         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2732 #endif
2733         if (should_dither) {
2734             c->chrDither8 = dither_8x8_128[chrDstY & 7];
2735             c->lumDither8 = dither_8x8_128[dstY & 7];
2736         }
2737         if (dstY >= dstH-2) {
2738             // hmm looks like we can't use MMX here without overwriting this array's tail
2739             find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
2740                                            &yuv2packed1, &yuv2packed2, &yuv2packedX);
2741             use_mmx_vfilter= 0;
2742         }
2743
2744         {
2745             const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2746             const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2747             const int16_t **chrVSrcPtr= (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2748             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2749             int16_t *vLumFilter= c->vLumFilter;
2750             int16_t *vChrFilter= c->vChrFilter;
2751
2752             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2753                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2754
2755                 vLumFilter +=    dstY * vLumFilterSize;
2756                 vChrFilter += chrDstY * vChrFilterSize;
2757
2758                 av_assert0(use_mmx_vfilter != (
2759                                yuv2planeX == yuv2planeX_10BE_c
2760                             || yuv2planeX == yuv2planeX_10LE_c
2761                             || yuv2planeX == yuv2planeX_9BE_c
2762                             || yuv2planeX == yuv2planeX_9LE_c
2763                             || yuv2planeX == yuv2planeX_16BE_c
2764                             || yuv2planeX == yuv2planeX_16LE_c
2765                             || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
2766
2767                 if(use_mmx_vfilter){
2768                     vLumFilter= c->lumMmxFilter;
2769                     vChrFilter= c->chrMmxFilter;
2770                 }
2771
2772                 if (vLumFilterSize == 1) {
2773                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
2774                 } else {
2775                     yuv2planeX(vLumFilter, vLumFilterSize,
2776                                lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
2777                 }
2778
2779                 if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
2780                     if (yuv2nv12cX) {
2781                         yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
2782                     } else if (vChrFilterSize == 1) {
2783                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
2784                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
2785                     } else {
2786                         yuv2planeX(vChrFilter, vChrFilterSize,
2787                                    chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
2788                         yuv2planeX(vChrFilter, vChrFilterSize,
2789                                    chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
2790                     }
2791                 }
2792
2793                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
2794                     if(use_mmx_vfilter){
2795                         vLumFilter= c->alpMmxFilter;
2796                     }
2797                     if (vLumFilterSize == 1) {
2798                         yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
2799                     } else {
2800                         yuv2planeX(vLumFilter, vLumFilterSize,
2801                                    alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
2802                     }
2803                 }
2804             } else {
2805                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2806                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2807                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2808                     int chrAlpha = vChrFilter[2 * dstY + 1];
2809                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2810                                 alpPixBuf ? *alpSrcPtr : NULL,
2811                                 dest[0], dstW, chrAlpha, dstY);
2812                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2813                     int lumAlpha = vLumFilter[2 * dstY + 1];
2814                     int chrAlpha = vChrFilter[2 * dstY + 1];
2815                     lumMmxFilter[2] =
2816                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2817                     chrMmxFilter[2] =
2818                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2819                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2820                                 alpPixBuf ? alpSrcPtr : NULL,
2821                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2822                 } else { //general RGB
2823                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2824                                 lumSrcPtr, vLumFilterSize,
2825                                 vChrFilter + dstY * vChrFilterSize,
2826                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2827                                 alpSrcPtr, dest[0], dstW, dstY);
2828                 }
2829             }
2830         }
2831     }
2832
2833     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2834         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2835
2836 #if HAVE_MMX2
2837     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2838         __asm__ volatile("sfence":::"memory");
2839 #endif
2840     emms_c();
2841
2842     /* store changed local vars back in the context */
2843     c->dstY= dstY;
2844     c->lumBufIndex= lumBufIndex;
2845     c->chrBufIndex= chrBufIndex;
2846     c->lastInLumBuf= lastInLumBuf;
2847     c->lastInChrBuf= lastInChrBuf;
2848
2849     return dstY - lastDstY;
2850 }
2851
2852 static av_cold void sws_init_swScale_c(SwsContext *c)
2853 {
2854     enum PixelFormat srcFormat = c->srcFormat;
2855
2856     find_c_packed_planar_out_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
2857                                    &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,
2858                                    &c->yuv2packedX);
2859
2860     c->chrToYV12 = NULL;
2861     switch(srcFormat) {
2862         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2863         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2864         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2865         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2866         case PIX_FMT_RGB8     :
2867         case PIX_FMT_BGR8     :
2868         case PIX_FMT_PAL8     :
2869         case PIX_FMT_BGR4_BYTE:
2870         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2871         case PIX_FMT_GBRP9LE:
2872         case PIX_FMT_GBRP10LE:
2873         case PIX_FMT_GBRP16LE:  c->readChrPlanar = planar_rgb16le_to_uv; break;
2874         case PIX_FMT_GBRP9BE:
2875         case PIX_FMT_GBRP10BE:
2876         case PIX_FMT_GBRP16BE:  c->readChrPlanar = planar_rgb16be_to_uv; break;
2877         case PIX_FMT_GBRP:      c->readChrPlanar = planar_rgb_to_uv; break;
2878 #if HAVE_BIGENDIAN
2879         case PIX_FMT_YUV444P9LE:
2880         case PIX_FMT_YUV422P9LE:
2881         case PIX_FMT_YUV420P9LE:
2882         case PIX_FMT_YUV422P10LE:
2883         case PIX_FMT_YUV420P10LE:
2884         case PIX_FMT_YUV444P10LE:
2885         case PIX_FMT_YUV420P16LE:
2886         case PIX_FMT_YUV422P16LE:
2887         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2888 #else
2889         case PIX_FMT_YUV444P9BE:
2890         case PIX_FMT_YUV422P9BE:
2891         case PIX_FMT_YUV420P9BE:
2892         case PIX_FMT_YUV444P10BE:
2893         case PIX_FMT_YUV422P10BE:
2894         case PIX_FMT_YUV420P10BE:
2895         case PIX_FMT_YUV420P16BE:
2896         case PIX_FMT_YUV422P16BE:
2897         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2898 #endif
2899     }
2900     if (c->chrSrcHSubSample) {
2901         switch(srcFormat) {
2902         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2903         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2904         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2905         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2906         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2907         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2908         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2909         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2910         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2911         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2912         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2913         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2914         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2915         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2916         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2917         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2918         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2919         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2920         case PIX_FMT_GBR24P  : c->chrToYV12 = gbr24pToUV_half_c;  break;
2921         }
2922     } else {
2923         switch(srcFormat) {
2924         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2925         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2926         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2927         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2928         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2929         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2930         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2931         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2932         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2933         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2934         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2935         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2936         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2937         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2938         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2939         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2940         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2941         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2942         }
2943     }
2944
2945     c->lumToYV12 = NULL;
2946     c->alpToYV12 = NULL;
2947     switch (srcFormat) {
2948     case PIX_FMT_GBRP9LE:
2949     case PIX_FMT_GBRP10LE:
2950     case PIX_FMT_GBRP16LE: c->readLumPlanar = planar_rgb16le_to_y; break;
2951     case PIX_FMT_GBRP9BE:
2952     case PIX_FMT_GBRP10BE:
2953     case PIX_FMT_GBRP16BE: c->readLumPlanar = planar_rgb16be_to_y; break;
2954     case PIX_FMT_GBRP:     c->readLumPlanar = planar_rgb_to_y; break;
2955 #if HAVE_BIGENDIAN
2956     case PIX_FMT_YUV444P9LE:
2957     case PIX_FMT_YUV422P9LE:
2958     case PIX_FMT_YUV420P9LE:
2959     case PIX_FMT_YUV422P10LE:
2960     case PIX_FMT_YUV420P10LE:
2961     case PIX_FMT_YUV444P10LE:
2962     case PIX_FMT_YUV420P16LE:
2963     case PIX_FMT_YUV422P16LE:
2964     case PIX_FMT_YUV444P16LE:
2965     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2966 #else
2967     case PIX_FMT_YUV444P9BE:
2968     case PIX_FMT_YUV422P9BE:
2969     case PIX_FMT_YUV420P9BE:
2970     case PIX_FMT_YUV444P10BE:
2971     case PIX_FMT_YUV422P10BE:
2972     case PIX_FMT_YUV420P10BE:
2973     case PIX_FMT_YUV420P16BE:
2974     case PIX_FMT_YUV422P16BE:
2975     case PIX_FMT_YUV444P16BE:
2976     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2977 #endif
2978     case PIX_FMT_YUYV422  :
2979     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2980     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2981     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2982     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2983     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2984     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2985     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2986     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2987     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2988     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2989     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2990     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2991     case PIX_FMT_RGB8     :
2992     case PIX_FMT_BGR8     :
2993     case PIX_FMT_PAL8     :
2994     case PIX_FMT_BGR4_BYTE:
2995     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2996     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2997     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2998     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2999     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
3000     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
3001     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
3002     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
3003     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
3004     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
3005     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
3006     }
3007     if (c->alpPixBuf) {
3008         switch (srcFormat) {
3009         case PIX_FMT_BGRA:
3010         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
3011         case PIX_FMT_ABGR:
3012         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
3013         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
3014         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
3015         }
3016     }
3017
3018
3019     if (c->srcBpc == 8) {
3020         if (c->dstBpc <= 10) {
3021             c->hyScale = c->hcScale = hScale8To15_c;
3022             if (c->flags & SWS_FAST_BILINEAR) {
3023                 c->hyscale_fast = hyscale_fast_c;
3024                 c->hcscale_fast = hcscale_fast_c;
3025             }
3026         } else {
3027             c->hyScale = c->hcScale = hScale8To19_c;
3028         }
3029     } else {
3030         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
3031     }
3032
3033     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
3034         if (c->dstBpc <= 10) {
3035             if (c->srcRange) {
3036                 c->lumConvertRange = lumRangeFromJpeg_c;
3037                 c->chrConvertRange = chrRangeFromJpeg_c;
3038             } else {
3039                 c->lumConvertRange = lumRangeToJpeg_c;
3040                 c->chrConvertRange = chrRangeToJpeg_c;
3041             }
3042         } else {
3043             if (c->srcRange) {
3044                 c->lumConvertRange = lumRangeFromJpeg16_c;
3045                 c->chrConvertRange = chrRangeFromJpeg16_c;
3046             } else {
3047                 c->lumConvertRange = lumRangeToJpeg16_c;
3048                 c->chrConvertRange = chrRangeToJpeg16_c;
3049             }
3050         }
3051     }
3052
3053     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
3054           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
3055         c->needs_hcscale = 1;
3056 }
3057
3058 SwsFunc ff_getSwsFunc(SwsContext *c)
3059 {
3060     sws_init_swScale_c(c);
3061
3062     if (HAVE_MMX)
3063         ff_sws_init_swScale_mmx(c);
3064     if (HAVE_ALTIVEC)
3065         ff_sws_init_swScale_altivec(c);
3066
3067     return swScale;
3068 }