/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/*
  supported input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  {BGR,RGB}{1,4,8,15,16} support dithering

  unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  BGR24 -> BGR32 & RGB24 -> RGB32
  BGR32 -> BGR24 & RGB32 -> RGB24

  tested special converters (most are tested actually, but I did not write it down ...)

  untested special converters
  YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  YV12/I420 -> YV12/I420
  YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  BGR24 -> BGR32 & RGB24 -> RGB32
  BGR32 -> BGR24 & RGB32 -> RGB24
*/
61 #include "swscale_internal.h"
63 #include "libavutil/avassert.h"
64 #include "libavutil/intreadwrite.h"
65 #include "libavutil/cpu.h"
66 #include "libavutil/avutil.h"
67 #include "libavutil/mathematics.h"
68 #include "libavutil/bswap.h"
69 #include "libavutil/pixdesc.h"
/* RGB -> YUV conversion coefficients (BT.601), scaled by 1 << RGB2YUV_SHIFT.
 * Luma coefficients use the 219-step limited luma range, chroma coefficients
 * the 224-step chroma range; negative terms carry their sign explicitly. */
#define RGB2YUV_SHIFT 15
#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
/*
  TODO:
  Special versions: fast Y 1:1 scaling (no interpolation in y direction)

  more intelligent misalignment avoidance for the horizontal scaler
  write special vertical cubic upscale version
  optimize C code (YV12 / minmax)
  add support for packed pixel YUV input & output
  add support for Y8 output
  optimize BGR24 & BGR32
  add BGR4 output support
  write special BGR->BGR scaler
*/
98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
99 { 1, 3, 1, 3, 1, 3, 1, 3, },
100 { 2, 0, 2, 0, 2, 0, 2, 0, },
103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
104 { 6, 2, 6, 2, 6, 2, 6, 2, },
105 { 0, 4, 0, 4, 0, 4, 0, 4, },
108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
109 { 8, 4, 11, 7, 8, 4, 11, 7, },
110 { 2, 14, 1, 13, 2, 14, 1, 13, },
111 { 10, 6, 9, 5, 10, 6, 9, 5, },
112 { 0, 12, 3, 15, 0, 12, 3, 15, },
115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
116 { 17, 9, 23, 15, 16, 8, 22, 14, },
117 { 5, 29, 3, 27, 4, 28, 2, 26, },
118 { 21, 13, 19, 11, 20, 12, 18, 10, },
119 { 0, 24, 6, 30, 1, 25, 7, 31, },
120 { 16, 8, 22, 14, 17, 9, 23, 15, },
121 { 4, 28, 2, 26, 5, 29, 3, 27, },
122 { 20, 12, 18, 10, 21, 13, 19, 11, },
123 { 1, 25, 7, 31, 0, 24, 6, 30, },
126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
127 { 0, 55, 14, 68, 3, 58, 17, 72, },
128 { 37, 18, 50, 32, 40, 22, 54, 35, },
129 { 9, 64, 5, 59, 13, 67, 8, 63, },
130 { 46, 27, 41, 23, 49, 31, 44, 26, },
131 { 2, 57, 16, 71, 1, 56, 15, 70, },
132 { 39, 21, 52, 34, 38, 19, 51, 33, },
133 { 11, 66, 7, 62, 10, 65, 6, 60, },
134 { 48, 30, 43, 25, 47, 29, 42, 24, },
138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
139 {117, 62, 158, 103, 113, 58, 155, 100, },
140 { 34, 199, 21, 186, 31, 196, 17, 182, },
141 {144, 89, 131, 76, 141, 86, 127, 72, },
142 { 0, 165, 41, 206, 10, 175, 52, 217, },
143 {110, 55, 151, 96, 120, 65, 162, 107, },
144 { 28, 193, 14, 179, 38, 203, 24, 189, },
145 {138, 83, 124, 69, 148, 93, 134, 79, },
146 { 7, 172, 48, 213, 3, 168, 45, 210, },
#if 0
/* Alternative dither_8x8_220 that tries to correct a gamma of 1.5.
 * Disabled: the linear table above is the active definition; leaving this
 * one unguarded would redefine the same symbol. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 143,  18, 200,   2, 156,  25, 215, },
{ 78,  28, 125,  64,  89,  36, 138,  74, },
{ 10, 180,   3, 161,  16, 195,   8, 175, },
{109,  51,  93,  38, 121,  60, 105,  47, },
{  1, 152,  23, 210,   0, 147,  20, 205, },
{ 85,  33, 134,  71,  81,  30, 130,  67, },
{ 14, 190,   6, 171,  12, 185,   5, 166, },
{117,  57, 101,  44, 113,  54,  97,  41, },
};
#endif
#if 0
/* Alternative dither_8x8_220 that tries to correct a gamma of 2.0.
 * Disabled: the linear table above is the active definition; leaving this
 * one unguarded would redefine the same symbol. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 124,   8, 193,   0, 140,  12, 213, },
{ 55,  14, 104,  42,  66,  19, 119,  52, },
{  3, 168,   1, 145,   6, 187,   3, 162, },
{ 86,  31,  70,  21,  99,  39,  82,  28, },
{  0, 134,  11, 206,   0, 129,   9, 200, },
{ 62,  17, 114,  48,  58,  16, 109,  45, },
{  5, 181,   2, 157,   4, 175,   1, 151, },
{ 95,  36,  78,  26,  90,  34,  74,  24, },
};
#endif
#if 0
/* Alternative dither_8x8_220 that tries to correct a gamma of 2.5.
 * Disabled: the linear table above is the active definition; leaving this
 * one unguarded would redefine the same symbol. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
{  0, 107,   3, 187,   0, 125,   6, 212, },
{ 39,   7,  86,  28,  49,  11, 102,  36, },
{  1, 158,   0, 131,   3, 180,   1, 151, },
{ 68,  19,  52,  12,  81,  25,  64,  17, },
{  0, 119,   5, 203,   0, 113,   4, 195, },
{ 45,   9,  96,  33,  42,   8,  91,  30, },
{  2, 172,   1, 144,   2, 165,   0, 137, },
{ 77,  23,  60,  15,  72,  21,  56,  14, },
};
#endif
185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
186 { 36, 68, 60, 92, 34, 66, 58, 90,},
187 { 100, 4,124, 28, 98, 2,122, 26,},
188 { 52, 84, 44, 76, 50, 82, 42, 74,},
189 { 116, 20,108, 12,114, 18,106, 10,},
190 { 32, 64, 56, 88, 38, 70, 62, 94,},
191 { 96, 0,120, 24,102, 6,126, 30,},
192 { 48, 80, 40, 72, 54, 86, 46, 78,},
193 { 112, 16,104, 8,118, 22,110, 14,},
195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
196 { 64, 64, 64, 64, 64, 64, 64, 64 };
198 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
200 { 0, 1, 0, 1, 0, 1, 0, 1,},
201 { 1, 0, 1, 0, 1, 0, 1, 0,},
202 { 0, 1, 0, 1, 0, 1, 0, 1,},
203 { 1, 0, 1, 0, 1, 0, 1, 0,},
204 { 0, 1, 0, 1, 0, 1, 0, 1,},
205 { 1, 0, 1, 0, 1, 0, 1, 0,},
206 { 0, 1, 0, 1, 0, 1, 0, 1,},
207 { 1, 0, 1, 0, 1, 0, 1, 0,},
209 { 1, 2, 1, 2, 1, 2, 1, 2,},
210 { 3, 0, 3, 0, 3, 0, 3, 0,},
211 { 1, 2, 1, 2, 1, 2, 1, 2,},
212 { 3, 0, 3, 0, 3, 0, 3, 0,},
213 { 1, 2, 1, 2, 1, 2, 1, 2,},
214 { 3, 0, 3, 0, 3, 0, 3, 0,},
215 { 1, 2, 1, 2, 1, 2, 1, 2,},
216 { 3, 0, 3, 0, 3, 0, 3, 0,},
218 { 2, 4, 3, 5, 2, 4, 3, 5,},
219 { 6, 0, 7, 1, 6, 0, 7, 1,},
220 { 3, 5, 2, 4, 3, 5, 2, 4,},
221 { 7, 1, 6, 0, 7, 1, 6, 0,},
222 { 2, 4, 3, 5, 2, 4, 3, 5,},
223 { 6, 0, 7, 1, 6, 0, 7, 1,},
224 { 3, 5, 2, 4, 3, 5, 2, 4,},
225 { 7, 1, 6, 0, 7, 1, 6, 0,},
227 { 4, 8, 7, 11, 4, 8, 7, 11,},
228 { 12, 0, 15, 3, 12, 0, 15, 3,},
229 { 6, 10, 5, 9, 6, 10, 5, 9,},
230 { 14, 2, 13, 1, 14, 2, 13, 1,},
231 { 4, 8, 7, 11, 4, 8, 7, 11,},
232 { 12, 0, 15, 3, 12, 0, 15, 3,},
233 { 6, 10, 5, 9, 6, 10, 5, 9,},
234 { 14, 2, 13, 1, 14, 2, 13, 1,},
236 { 9, 17, 15, 23, 8, 16, 14, 22,},
237 { 25, 1, 31, 7, 24, 0, 30, 6,},
238 { 13, 21, 11, 19, 12, 20, 10, 18,},
239 { 29, 5, 27, 3, 28, 4, 26, 2,},
240 { 8, 16, 14, 22, 9, 17, 15, 23,},
241 { 24, 0, 30, 6, 25, 1, 31, 7,},
242 { 12, 20, 10, 18, 13, 21, 11, 19,},
243 { 28, 4, 26, 2, 29, 5, 27, 3,},
245 { 18, 34, 30, 46, 17, 33, 29, 45,},
246 { 50, 2, 62, 14, 49, 1, 61, 13,},
247 { 26, 42, 22, 38, 25, 41, 21, 37,},
248 { 58, 10, 54, 6, 57, 9, 53, 5,},
249 { 16, 32, 28, 44, 19, 35, 31, 47,},
250 { 48, 0, 60, 12, 51, 3, 63, 15,},
251 { 24, 40, 20, 36, 27, 43, 23, 39,},
252 { 56, 8, 52, 4, 59, 11, 55, 7,},
254 { 18, 34, 30, 46, 17, 33, 29, 45,},
255 { 50, 2, 62, 14, 49, 1, 61, 13,},
256 { 26, 42, 22, 38, 25, 41, 21, 37,},
257 { 58, 10, 54, 6, 57, 9, 53, 5,},
258 { 16, 32, 28, 44, 19, 35, 31, 47,},
259 { 48, 0, 60, 12, 51, 3, 63, 15,},
260 { 24, 40, 20, 36, 27, 43, 23, 39,},
261 { 56, 8, 52, 4, 59, 11, 55, 7,},
263 { 36, 68, 60, 92, 34, 66, 58, 90,},
264 { 100, 4,124, 28, 98, 2,122, 26,},
265 { 52, 84, 44, 76, 50, 82, 42, 74,},
266 { 116, 20,108, 12,114, 18,106, 10,},
267 { 32, 64, 56, 88, 38, 70, 62, 94,},
268 { 96, 0,120, 24,102, 6,126, 30,},
269 { 48, 80, 40, 72, 54, 86, 46, 78,},
270 { 112, 16,104, 8,118, 22,110, 14,},
/* Flat (no-op) dither row: constant 64 in every byte. */
static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
/* Scale factors used when dithering from [row+1] input bits down to
 * [col+1] output bits. */
const uint16_t dither_scale[15][16]={
{    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
{    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
{    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
{    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
{    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
{    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
{    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
{    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
{    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
{    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
{    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
{    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
{    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
{    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
{    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
};
/* Store one 16-bit sample, biased and clipped, honouring the caller's
 * `big_endian` flag and `shift` local. */
#define output_pixel(pos, val, bias, signedness) \
    if (big_endian) { \
        AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    } else { \
        AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    }
300 static av_always_inline void
301 yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
302 int big_endian, int output_bits)
306 av_assert0(output_bits == 16);
308 for (i = 0; i < dstW; i++) {
309 int val = src[i] + (1 << (shift - 1));
310 output_pixel(&dest[i], val, 0, uint);
314 static av_always_inline void
315 yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
316 const int32_t **src, uint16_t *dest, int dstW,
317 int big_endian, int output_bits)
321 av_assert0(output_bits == 16);
323 for (i = 0; i < dstW; i++) {
324 int val = 1 << (shift - 1);
327 /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
328 * filters (or anything with negative coeffs, the range can be slightly
329 * wider in both directions. To account for this overflow, we subtract
330 * a constant so it always fits in the signed range (assuming a
331 * reasonable filterSize), and re-add that at the end. */
333 for (j = 0; j < filterSize; j++)
334 val += src[j][i] * filter[j];
336 output_pixel(&dest[i], val, 0x8000, int);
#undef output_pixel

/* Store one 9/10-bit sample clipped to `output_bits`, honouring the
 * caller's `big_endian` flag and `shift` local. */
#define output_pixel(pos, val) \
    if (big_endian) { \
        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    } else { \
        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    }
349 static av_always_inline void
350 yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
351 int big_endian, int output_bits)
354 int shift = 15 - output_bits;
356 for (i = 0; i < dstW; i++) {
357 int val = src[i] + (1 << (shift - 1));
358 output_pixel(&dest[i], val);
362 static av_always_inline void
363 yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
364 const int16_t **src, uint16_t *dest, int dstW,
365 int big_endian, int output_bits)
368 int shift = 11 + 16 - output_bits;
370 for (i = 0; i < dstW; i++) {
371 int val = 1 << (shift - 1);
374 for (j = 0; j < filterSize; j++)
375 val += src[j][i] * filter[j];
377 output_pixel(&dest[i], val);
/* Instantiate the plane1/planeX entry points for one bit depth and
 * endianness by delegating to the matching _c_template above. */
#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
                              uint8_t *dest, int dstW, \
                              const uint8_t *dither, int offset)\
{ \
    yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
                         (uint16_t *) dest, dstW, is_be, bits); \
}\
static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
                              const int16_t **src, uint8_t *dest, int dstW, \
                              const uint8_t *dither, int offset)\
{ \
    yuv2planeX_## template_size ## _c_template(filter, \
                         filterSize, (const typeX_t **) src, \
                         (uint16_t *) dest, dstW, is_be, bits); \
}
399 yuv2NBPS( 9, BE, 1, 10, int16_t);
400 yuv2NBPS( 9, LE, 0, 10, int16_t);
401 yuv2NBPS(10, BE, 1, 10, int16_t);
402 yuv2NBPS(10, LE, 0, 10, int16_t);
403 yuv2NBPS(16, BE, 1, 16, int32_t);
404 yuv2NBPS(16, LE, 0, 16, int32_t);
/* Filtered vertical output of one 8-bit plane with ordered dithering. */
static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset)
{
    int i;
    for (i=0; i<dstW; i++) {
        int val = dither[(i + offset) & 7] << 12; /* dither in 12.x fixed point */
        int j;
        for (j=0; j<filterSize; j++)
            val += src[j][i] * filter[j];

        dest[i]= av_clip_uint8(val>>19);
    }
}
/* Unfiltered vertical output of one 8-bit plane with ordered dithering. */
static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset)
{
    int i;
    for (i=0; i<dstW; i++) {
        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
        dest[i]= av_clip_uint8(val);
    }
}
431 static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
432 const int16_t **chrUSrc, const int16_t **chrVSrc,
433 uint8_t *dest, int chrDstW)
435 enum PixelFormat dstFormat = c->dstFormat;
436 const uint8_t *chrDither = c->chrDither8;
439 if (dstFormat == PIX_FMT_NV12)
440 for (i=0; i<chrDstW; i++) {
441 int u = chrDither[i & 7] << 12;
442 int v = chrDither[(i + 3) & 7] << 12;
444 for (j=0; j<chrFilterSize; j++) {
445 u += chrUSrc[j][i] * chrFilter[j];
446 v += chrVSrc[j][i] * chrFilter[j];
449 dest[2*i]= av_clip_uint8(u>>19);
450 dest[2*i+1]= av_clip_uint8(v>>19);
453 for (i=0; i<chrDstW; i++) {
454 int u = chrDither[i & 7] << 12;
455 int v = chrDither[(i + 3) & 7] << 12;
457 for (j=0; j<chrFilterSize; j++) {
458 u += chrUSrc[j][i] * chrFilter[j];
459 v += chrVSrc[j][i] * chrFilter[j];
462 dest[2*i]= av_clip_uint8(v>>19);
463 dest[2*i+1]= av_clip_uint8(u>>19);
#undef output_pixel

/* Store one gray16 sample with the endianness selected by `target`. */
#define output_pixel(pos, val) \
    if (target == PIX_FMT_GRAY16BE) { \
        AV_WB16(pos, val); \
    } else { \
        AV_WL16(pos, val); \
    }
474 static av_always_inline void
475 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
476 const int32_t **lumSrc, int lumFilterSize,
477 const int16_t *chrFilter, const int32_t **chrUSrc,
478 const int32_t **chrVSrc, int chrFilterSize,
479 const int32_t **alpSrc, uint16_t *dest, int dstW,
480 int y, enum PixelFormat target)
484 for (i = 0; i < (dstW >> 1); i++) {
489 for (j = 0; j < lumFilterSize; j++) {
490 Y1 += lumSrc[j][i * 2] * lumFilter[j];
491 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
495 if ((Y1 | Y2) & 0x10000) {
496 Y1 = av_clip_uint16(Y1);
497 Y2 = av_clip_uint16(Y2);
499 output_pixel(&dest[i * 2 + 0], Y1);
500 output_pixel(&dest[i * 2 + 1], Y2);
504 static av_always_inline void
505 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
506 const int32_t *ubuf[2], const int32_t *vbuf[2],
507 const int32_t *abuf[2], uint16_t *dest, int dstW,
508 int yalpha, int uvalpha, int y,
509 enum PixelFormat target)
511 int yalpha1 = 4095 - yalpha;
513 const int32_t *buf0 = buf[0], *buf1 = buf[1];
515 for (i = 0; i < (dstW >> 1); i++) {
516 int Y1 = (buf0[i * 2 ] * yalpha1 + buf1[i * 2 ] * yalpha) >> 15;
517 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
519 output_pixel(&dest[i * 2 + 0], Y1);
520 output_pixel(&dest[i * 2 + 1], Y2);
524 static av_always_inline void
525 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
526 const int32_t *ubuf[2], const int32_t *vbuf[2],
527 const int32_t *abuf0, uint16_t *dest, int dstW,
528 int uvalpha, int y, enum PixelFormat target)
532 for (i = 0; i < (dstW >> 1); i++) {
533 int Y1 = (buf0[i * 2 ]+4)>>3;
534 int Y2 = (buf0[i * 2 + 1]+4)>>3;
536 output_pixel(&dest[i * 2 + 0], Y1);
537 output_pixel(&dest[i * 2 + 1], Y2);
/* Generate the X/2/1 entry points for a 16-bit-per-component packed output
 * format by casting the 16-bit plane pointers to the 32-bit intermediates
 * the templates expect and forwarding to the matching template. */
#define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                        const int16_t **_lumSrc, int lumFilterSize, \
                        const int16_t *chrFilter, const int16_t **_chrUSrc, \
                        const int16_t **_chrVSrc, int chrFilterSize, \
                        const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
                        int y) \
{ \
    const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
                  **chrUSrc = (const int32_t **) _chrUSrc, \
                  **chrVSrc = (const int32_t **) _chrVSrc, \
                  **alpSrc  = (const int32_t **) _alpSrc; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                          alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
                        const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
                        int yalpha, int uvalpha, int y) \
{ \
    const int32_t **buf  = (const int32_t **) _buf, \
                  **ubuf = (const int32_t **) _ubuf, \
                  **vbuf = (const int32_t **) _vbuf, \
                  **abuf = (const int32_t **) _abuf; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                          dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
                        const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
                        const int16_t *_abuf0, uint8_t *_dest, int dstW, \
                        int uvalpha, int y) \
{ \
    const int32_t  *buf0  = (const int32_t *)  _buf0, \
                  **ubuf  = (const int32_t **) _ubuf, \
                  **vbuf  = (const int32_t **) _vbuf, \
                   *abuf0 = (const int32_t *)  _abuf0; \
    uint16_t *dest = (uint16_t *) _dest; \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt); \
}
589 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
590 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
#undef output_pixel

/* Store one packed 8-pixel monochrome byte; MONOWHITE is the inverse
 * polarity of MONOBLACK. */
#define output_pixel(pos, acc) \
    if (target == PIX_FMT_MONOBLACK) { \
        pos = acc; \
    } else { \
        pos = ~acc; \
    }
599 static av_always_inline void
600 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
601 const int16_t **lumSrc, int lumFilterSize,
602 const int16_t *chrFilter, const int16_t **chrUSrc,
603 const int16_t **chrVSrc, int chrFilterSize,
604 const int16_t **alpSrc, uint8_t *dest, int dstW,
605 int y, enum PixelFormat target)
607 const uint8_t * const d128=dither_8x8_220[y&7];
608 uint8_t *g = c->table_gU[128] + c->table_gV[128];
612 for (i = 0; i < dstW - 1; i += 2) {
617 for (j = 0; j < lumFilterSize; j++) {
618 Y1 += lumSrc[j][i] * lumFilter[j];
619 Y2 += lumSrc[j][i+1] * lumFilter[j];
623 if ((Y1 | Y2) & 0x100) {
624 Y1 = av_clip_uint8(Y1);
625 Y2 = av_clip_uint8(Y2);
627 acc += acc + g[Y1 + d128[(i + 0) & 7]];
628 acc += acc + g[Y2 + d128[(i + 1) & 7]];
630 output_pixel(*dest++, acc);
635 static av_always_inline void
636 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
637 const int16_t *ubuf[2], const int16_t *vbuf[2],
638 const int16_t *abuf[2], uint8_t *dest, int dstW,
639 int yalpha, int uvalpha, int y,
640 enum PixelFormat target)
642 const int16_t *buf0 = buf[0], *buf1 = buf[1];
643 const uint8_t * const d128 = dither_8x8_220[y & 7];
644 uint8_t *g = c->table_gU[128] + c->table_gV[128];
645 int yalpha1 = 4095 - yalpha;
648 for (i = 0; i < dstW - 7; i += 8) {
649 int acc = g[((buf0[i ] * yalpha1 + buf1[i ] * yalpha) >> 19) + d128[0]];
650 acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
651 acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
652 acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
653 acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
654 acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
655 acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
656 acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
657 output_pixel(*dest++, acc);
661 static av_always_inline void
662 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
663 const int16_t *ubuf[2], const int16_t *vbuf[2],
664 const int16_t *abuf0, uint8_t *dest, int dstW,
665 int uvalpha, int y, enum PixelFormat target)
667 const uint8_t * const d128 = dither_8x8_220[y & 7];
668 uint8_t *g = c->table_gU[128] + c->table_gV[128];
671 for (i = 0; i < dstW - 7; i += 8) {
672 int acc = g[(buf0[i ] >> 7) + d128[0]];
673 acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
674 acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
675 acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
676 acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
677 acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
678 acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
679 acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
680 output_pixel(*dest++, acc);
/* Generate the X/2/1 entry points for an 8-bit packed output format by
 * forwarding straight to the matching template with `fmt` baked in. */
#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
                                  abuf0, dest, dstW, uvalpha, \
                                  y, fmt); \
}
718 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
719 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
/* Store one macropixel (2 luma + shared chroma) in YUYV or UYVY order. */
#define output_pixels(pos, Y1, U, Y2, V) \
    if (target == PIX_FMT_YUYV422) { \
        dest[pos + 0] = Y1; \
        dest[pos + 1] = U; \
        dest[pos + 2] = Y2; \
        dest[pos + 3] = V; \
    } else { \
        dest[pos + 0] = U; \
        dest[pos + 1] = Y1; \
        dest[pos + 2] = V; \
        dest[pos + 3] = Y2; \
    }
734 static av_always_inline void
735 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
736 const int16_t **lumSrc, int lumFilterSize,
737 const int16_t *chrFilter, const int16_t **chrUSrc,
738 const int16_t **chrVSrc, int chrFilterSize,
739 const int16_t **alpSrc, uint8_t *dest, int dstW,
740 int y, enum PixelFormat target)
744 for (i = 0; i < (dstW >> 1); i++) {
751 for (j = 0; j < lumFilterSize; j++) {
752 Y1 += lumSrc[j][i * 2] * lumFilter[j];
753 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
755 for (j = 0; j < chrFilterSize; j++) {
756 U += chrUSrc[j][i] * chrFilter[j];
757 V += chrVSrc[j][i] * chrFilter[j];
763 if ((Y1 | Y2 | U | V) & 0x100) {
764 Y1 = av_clip_uint8(Y1);
765 Y2 = av_clip_uint8(Y2);
766 U = av_clip_uint8(U);
767 V = av_clip_uint8(V);
769 output_pixels(4*i, Y1, U, Y2, V);
773 static av_always_inline void
774 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
775 const int16_t *ubuf[2], const int16_t *vbuf[2],
776 const int16_t *abuf[2], uint8_t *dest, int dstW,
777 int yalpha, int uvalpha, int y,
778 enum PixelFormat target)
780 const int16_t *buf0 = buf[0], *buf1 = buf[1],
781 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
782 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
783 int yalpha1 = 4095 - yalpha;
784 int uvalpha1 = 4095 - uvalpha;
787 for (i = 0; i < (dstW >> 1); i++) {
788 int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
789 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
790 int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
791 int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
793 output_pixels(i * 4, Y1, U, Y2, V);
797 static av_always_inline void
798 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
799 const int16_t *ubuf[2], const int16_t *vbuf[2],
800 const int16_t *abuf0, uint8_t *dest, int dstW,
801 int uvalpha, int y, enum PixelFormat target)
803 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
804 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
807 if (uvalpha < 2048) {
808 for (i = 0; i < (dstW >> 1); i++) {
809 int Y1 = buf0[i * 2] >> 7;
810 int Y2 = buf0[i * 2 + 1] >> 7;
811 int U = ubuf1[i] >> 7;
812 int V = vbuf1[i] >> 7;
814 output_pixels(i * 4, Y1, U, Y2, V);
817 for (i = 0; i < (dstW >> 1); i++) {
818 int Y1 = buf0[i * 2] >> 7;
819 int Y2 = buf0[i * 2 + 1] >> 7;
820 int U = (ubuf0[i] + ubuf1[i]) >> 8;
821 int V = (vbuf0[i] + vbuf1[i]) >> 8;
823 output_pixels(i * 4, Y1, U, Y2, V);
830 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
831 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
/* Component-order helpers for 48-bit RGB vs BGR, and a 16-bit store with
 * endianness selected from `target`. */
#define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
#define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)

#undef output_pixel
#define output_pixel(pos, val) \
    if (isBE(target)) { \
        AV_WB16(pos, val); \
    } else { \
        AV_WL16(pos, val); \
    }
842 static av_always_inline void
843 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
844 const int32_t **lumSrc, int lumFilterSize,
845 const int16_t *chrFilter, const int32_t **chrUSrc,
846 const int32_t **chrVSrc, int chrFilterSize,
847 const int32_t **alpSrc, uint16_t *dest, int dstW,
848 int y, enum PixelFormat target)
852 for (i = 0; i < (dstW >> 1); i++) {
856 int U = -128 << 23; // 19
860 for (j = 0; j < lumFilterSize; j++) {
861 Y1 += lumSrc[j][i * 2] * lumFilter[j];
862 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
864 for (j = 0; j < chrFilterSize; j++) {
865 U += chrUSrc[j][i] * chrFilter[j];
866 V += chrVSrc[j][i] * chrFilter[j];
869 // 8bit: 12+15=27; 16-bit: 12+19=31
875 // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
876 Y1 -= c->yuv2rgb_y_offset;
877 Y2 -= c->yuv2rgb_y_offset;
878 Y1 *= c->yuv2rgb_y_coeff;
879 Y2 *= c->yuv2rgb_y_coeff;
882 // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
884 R = V * c->yuv2rgb_v2r_coeff;
885 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
886 B = U * c->yuv2rgb_u2b_coeff;
888 // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
889 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
890 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
891 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
892 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
893 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
894 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
899 static av_always_inline void
900 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
901 const int32_t *ubuf[2], const int32_t *vbuf[2],
902 const int32_t *abuf[2], uint16_t *dest, int dstW,
903 int yalpha, int uvalpha, int y,
904 enum PixelFormat target)
906 const int32_t *buf0 = buf[0], *buf1 = buf[1],
907 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
908 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
909 int yalpha1 = 4095 - yalpha;
910 int uvalpha1 = 4095 - uvalpha;
913 for (i = 0; i < (dstW >> 1); i++) {
914 int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 14;
915 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 14;
916 int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha + (-128 << 23)) >> 14;
917 int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha + (-128 << 23)) >> 14;
920 Y1 -= c->yuv2rgb_y_offset;
921 Y2 -= c->yuv2rgb_y_offset;
922 Y1 *= c->yuv2rgb_y_coeff;
923 Y2 *= c->yuv2rgb_y_coeff;
927 R = V * c->yuv2rgb_v2r_coeff;
928 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
929 B = U * c->yuv2rgb_u2b_coeff;
931 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
932 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
933 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
934 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
935 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
936 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
941 static av_always_inline void
942 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
943 const int32_t *ubuf[2], const int32_t *vbuf[2],
944 const int32_t *abuf0, uint16_t *dest, int dstW,
945 int uvalpha, int y, enum PixelFormat target)
947 const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
948 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
951 if (uvalpha < 2048) {
952 for (i = 0; i < (dstW >> 1); i++) {
953 int Y1 = (buf0[i * 2] ) >> 2;
954 int Y2 = (buf0[i * 2 + 1]) >> 2;
955 int U = (ubuf0[i] + (-128 << 11)) >> 2;
956 int V = (vbuf0[i] + (-128 << 11)) >> 2;
959 Y1 -= c->yuv2rgb_y_offset;
960 Y2 -= c->yuv2rgb_y_offset;
961 Y1 *= c->yuv2rgb_y_coeff;
962 Y2 *= c->yuv2rgb_y_coeff;
966 R = V * c->yuv2rgb_v2r_coeff;
967 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
968 B = U * c->yuv2rgb_u2b_coeff;
970 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
971 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
972 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
973 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
974 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
975 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
979 for (i = 0; i < (dstW >> 1); i++) {
980 int Y1 = (buf0[i * 2] ) >> 2;
981 int Y2 = (buf0[i * 2 + 1]) >> 2;
982 int U = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
983 int V = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
986 Y1 -= c->yuv2rgb_y_offset;
987 Y2 -= c->yuv2rgb_y_offset;
988 Y1 *= c->yuv2rgb_y_coeff;
989 Y2 *= c->yuv2rgb_y_coeff;
993 R = V * c->yuv2rgb_v2r_coeff;
994 G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
995 B = U * c->yuv2rgb_u2b_coeff;
997 output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
998 output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
999 output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1000 output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1001 output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
1002 output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1012 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
1013 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
1014 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
1015 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
/* Write two horizontally adjacent output pixels (indices i*2 and i*2+1) to
 * `dest` for one of the packed RGB target formats, using the precomputed
 * per-component lookup tables `_r`, `_g`, `_b` (indexed by luma, optionally
 * offset by a dither value).  `target` selects the layout at compile time
 * (this function is always inlined into a format-specific wrapper, so the
 * format branches fold away).  A1/A2 are alpha values used only when
 * hasAlpha is set.  NOTE(review): extraction artifact — several original
 * lines (braces, #undef's, an #if/#else around the alpha paths) are elided
 * in this copy. */
1017 static av_always_inline void
1018 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
1019 int U, int V, int A1, int A2,
1020 const void *_r, const void *_g, const void *_b, int y,
1021 enum PixelFormat target, int hasAlpha)
/* 32-bit-per-pixel targets: tables are pre-shifted uint32 contributions,
 * so one add per component composes the pixel; alpha is OR-ed in at bit 0
 * or 24 depending on whether the format stores alpha first. */
1023 if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1024 target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1025 uint32_t *dest = (uint32_t *) _dest;
1026 const uint32_t *r = (const uint32_t *) _r;
1027 const uint32_t *g = (const uint32_t *) _g;
1028 const uint32_t *b = (const uint32_t *) _b;
1031 int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1033 dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1034 dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
/* NOTE(review): the following lines 1037-1043 are an alternative alpha /
 * no-alpha pair of code paths; the #if/#else preprocessor lines selecting
 * between them were lost in extraction — confirm against upstream. */
1037 int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1039 dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1040 dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1042 dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1043 dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
/* 24-bit packed targets: write 6 bytes per pixel pair; r_b/b_r swap the
 * red and blue table lookups for BGR vs RGB ordering. */
1046 } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1047 uint8_t *dest = (uint8_t *) _dest;
1048 const uint8_t *r = (const uint8_t *) _r;
1049 const uint8_t *g = (const uint8_t *) _g;
1050 const uint8_t *b = (const uint8_t *) _b;
1052 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1053 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1055 dest[i * 6 + 0] = r_b[Y1];
1056 dest[i * 6 + 1] = g[Y1];
1057 dest[i * 6 + 2] = b_r[Y1];
1058 dest[i * 6 + 3] = r_b[Y2];
1059 dest[i * 6 + 4] = g[Y2];
1060 dest[i * 6 + 5] = b_r[Y2];
/* 16/15/12-bit targets: per-pixel ordered dither offsets are added to the
 * luma index before the table lookup; 2x2 matrices for 565/555, 4x4 for
 * 444, with the blue channel using the row-inverted matrix. */
1063 } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1064 target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1065 target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1066 uint16_t *dest = (uint16_t *) _dest;
1067 const uint16_t *r = (const uint16_t *) _r;
1068 const uint16_t *g = (const uint16_t *) _g;
1069 const uint16_t *b = (const uint16_t *) _b;
1070 int dr1, dg1, db1, dr2, dg2, db2;
1072 if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1073 dr1 = dither_2x2_8[ y & 1 ][0];
1074 dg1 = dither_2x2_4[ y & 1 ][0];
1075 db1 = dither_2x2_8[(y & 1) ^ 1][0];
1076 dr2 = dither_2x2_8[ y & 1 ][1];
1077 dg2 = dither_2x2_4[ y & 1 ][1];
1078 db2 = dither_2x2_8[(y & 1) ^ 1][1];
1079 } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1080 dr1 = dither_2x2_8[ y & 1 ][0];
1081 dg1 = dither_2x2_8[ y & 1 ][1];
1082 db1 = dither_2x2_8[(y & 1) ^ 1][0];
1083 dr2 = dither_2x2_8[ y & 1 ][1];
1084 dg2 = dither_2x2_8[ y & 1 ][0];
1085 db2 = dither_2x2_8[(y & 1) ^ 1][1];
1087 dr1 = dither_4x4_16[ y & 3 ][0];
1088 dg1 = dither_4x4_16[ y & 3 ][1];
1089 db1 = dither_4x4_16[(y & 3) ^ 3][0];
1090 dr2 = dither_4x4_16[ y & 3 ][1];
1091 dg2 = dither_4x4_16[ y & 3 ][0];
1092 db2 = dither_4x4_16[(y & 3) ^ 3][1];
1095 dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1096 dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
/* 8/4-bit targets: larger dither matrices indexed by output x position;
 * RGB4/BGR4 packs two 4-bit pixels into one byte. */
1097 } else /* 8/4-bit */ {
1098 uint8_t *dest = (uint8_t *) _dest;
1099 const uint8_t *r = (const uint8_t *) _r;
1100 const uint8_t *g = (const uint8_t *) _g;
1101 const uint8_t *b = (const uint8_t *) _b;
1102 int dr1, dg1, db1, dr2, dg2, db2;
1104 if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1105 const uint8_t * const d64 = dither_8x8_73[y & 7];
1106 const uint8_t * const d32 = dither_8x8_32[y & 7];
1107 dr1 = dg1 = d32[(i * 2 + 0) & 7];
1108 db1 = d64[(i * 2 + 0) & 7];
1109 dr2 = dg2 = d32[(i * 2 + 1) & 7];
1110 db2 = d64[(i * 2 + 1) & 7];
1112 const uint8_t * const d64 = dither_8x8_73 [y & 7];
1113 const uint8_t * const d128 = dither_8x8_220[y & 7];
1114 dr1 = db1 = d128[(i * 2 + 0) & 7];
1115 dg1 = d64[(i * 2 + 0) & 7];
1116 dr2 = db2 = d128[(i * 2 + 1) & 7];
1117 dg2 = d64[(i * 2 + 1) & 7];
1120 if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1121 dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1122 ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1124 dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1125 dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
/* Multi-tap vertical scaling + YUV->RGB output: for each output pixel pair,
 * apply the lum/chr FIR filters across the input line buffers, clip to
 * 8 bits, then look up the RGB tables and emit via yuv2rgb_write().
 * NOTE(review): extraction artifact — the accumulator initializations
 * (Y1/Y2/U/V/A1/A2 start values) and several closing braces are elided
 * from this copy. */
1130 static av_always_inline void
1131 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1132 const int16_t **lumSrc, int lumFilterSize,
1133 const int16_t *chrFilter, const int16_t **chrUSrc,
1134 const int16_t **chrVSrc, int chrFilterSize,
1135 const int16_t **alpSrc, uint8_t *dest, int dstW,
1136 int y, enum PixelFormat target, int hasAlpha)
1140 for (i = 0; i < (dstW >> 1); i++) {
1146 int av_unused A1, A2;
1147 const void *r, *g, *b;
/* Accumulate the vertical filter taps for two luma and one chroma sample. */
1149 for (j = 0; j < lumFilterSize; j++) {
1150 Y1 += lumSrc[j][i * 2] * lumFilter[j];
1151 Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1153 for (j = 0; j < chrFilterSize; j++) {
1154 U += chrUSrc[j][i] * chrFilter[j];
1155 V += chrVSrc[j][i] * chrFilter[j];
/* Cheap overflow test: only clip when some value left the 0..255 range. */
1161 if ((Y1 | Y2 | U | V) & 0x100) {
1162 Y1 = av_clip_uint8(Y1);
1163 Y2 = av_clip_uint8(Y2);
1164 U = av_clip_uint8(U);
1165 V = av_clip_uint8(V);
/* Same filtering/clipping for the alpha plane when present. */
1170 for (j = 0; j < lumFilterSize; j++) {
1171 A1 += alpSrc[j][i * 2 ] * lumFilter[j];
1172 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1176 if ((A1 | A2) & 0x100) {
1177 A1 = av_clip_uint8(A1);
1178 A2 = av_clip_uint8(A2);
1182 /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1184 g = (c->table_gU[U] + c->table_gV[V]);
1187 yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1188 r, g, b, y, target, hasAlpha);
/* Two-tap (bilinear) vertical scaling + YUV->RGB output: blend two input
 * lines with weights yalpha/uvalpha (12-bit, 0..4095), shift back down to
 * 8 bits (>>19 of 15-bit samples * 12-bit weights), then emit a pixel pair
 * through yuv2rgb_write(). */
1193 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1194 const int16_t *ubuf[2], const int16_t *vbuf[2],
1195 const int16_t *abuf[2], uint8_t *dest, int dstW,
1196 int yalpha, int uvalpha, int y,
1197 enum PixelFormat target, int hasAlpha)
1199 const int16_t *buf0 = buf[0], *buf1 = buf[1],
1200 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1201 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1202 *abuf0 = hasAlpha ? abuf[0] : NULL,
1203 *abuf1 = hasAlpha ? abuf[1] : NULL;
/* Complementary weights so buf0*w0 + buf1*w1 interpolates between lines. */
1204 int yalpha1 = 4095 - yalpha;
1205 int uvalpha1 = 4095 - uvalpha;
1208 for (i = 0; i < (dstW >> 1); i++) {
1209 int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
1210 int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
1211 int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
1212 int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
1214 const void *r = c->table_rV[V],
1215 *g = (c->table_gU[U] + c->table_gV[V]),
1216 *b = c->table_bU[U];
1219 A1 = (abuf0[i * 2 ] * yalpha1 + abuf1[i * 2 ] * yalpha) >> 19;
1220 A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1223 yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1224 r, g, b, y, target, hasAlpha);
1228 static av_always_inline void
1229 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1230 const int16_t *ubuf[2], const int16_t *vbuf[2],
1231 const int16_t *abuf0, uint8_t *dest, int dstW,
1232 int uvalpha, int y, enum PixelFormat target,
1235 const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1236 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1239 if (uvalpha < 2048) {
1240 for (i = 0; i < (dstW >> 1); i++) {
1241 int Y1 = buf0[i * 2] >> 7;
1242 int Y2 = buf0[i * 2 + 1] >> 7;
1243 int U = ubuf1[i] >> 7;
1244 int V = vbuf1[i] >> 7;
1246 const void *r = c->table_rV[V],
1247 *g = (c->table_gU[U] + c->table_gV[V]),
1248 *b = c->table_bU[U];
1251 A1 = abuf0[i * 2 ] >> 7;
1252 A2 = abuf0[i * 2 + 1] >> 7;
1255 yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1256 r, g, b, y, target, hasAlpha);
1259 for (i = 0; i < (dstW >> 1); i++) {
1260 int Y1 = buf0[i * 2] >> 7;
1261 int Y2 = buf0[i * 2 + 1] >> 7;
1262 int U = (ubuf0[i] + ubuf1[i]) >> 8;
1263 int V = (vbuf0[i] + vbuf1[i]) >> 8;
1265 const void *r = c->table_rV[V],
1266 *g = (c->table_gU[U] + c->table_gV[V]),
1267 *b = c->table_bU[U];
1270 A1 = abuf0[i * 2 ] >> 7;
1271 A2 = abuf0[i * 2 + 1] >> 7;
1274 yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1275 r, g, b, y, target, hasAlpha);
/* Wrapper generators: YUV2RGBWRAPPERX emits the _X_c (multi-tap) entry
 * point for a given name/format/hasAlpha combination; YUV2RGBWRAPPER also
 * emits the _2_c (two-tap) and _1_c (one-tap) entry points, all forwarding
 * to the corresponding *_c_template with the format baked in so the
 * per-format branches constant-fold. */
1280 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1281 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1282 const int16_t **lumSrc, int lumFilterSize, \
1283 const int16_t *chrFilter, const int16_t **chrUSrc, \
1284 const int16_t **chrVSrc, int chrFilterSize, \
1285 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1288 name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1289 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1290 alpSrc, dest, dstW, y, fmt, hasAlpha); \
1292 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1293 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1294 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1295 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1296 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1297 int yalpha, int uvalpha, int y) \
1299 name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1300 dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1303 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1304 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1305 const int16_t *abuf0, uint8_t *dest, int dstW, \
1306 int uvalpha, int y) \
1308 name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1309 dstW, uvalpha, y, fmt, hasAlpha); \
/* Instantiations: the first two pick alpha support at runtime from
 * c->alpPixBuf; the a32/x32 pairs are compile-time alpha / no-alpha
 * variants (gated by CONFIG_SWSCALE_ALPHA); the rest are fixed non-alpha
 * packed formats. */
1313 YUV2RGBWRAPPER(yuv2rgb,, 32_1, PIX_FMT_RGB32_1, CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1314 YUV2RGBWRAPPER(yuv2rgb,, 32, PIX_FMT_RGB32, CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1316 #if CONFIG_SWSCALE_ALPHA
1317 YUV2RGBWRAPPER(yuv2rgb,, a32_1, PIX_FMT_RGB32_1, 1);
1318 YUV2RGBWRAPPER(yuv2rgb,, a32, PIX_FMT_RGB32, 1);
1320 YUV2RGBWRAPPER(yuv2rgb,, x32_1, PIX_FMT_RGB32_1, 0);
1321 YUV2RGBWRAPPER(yuv2rgb,, x32, PIX_FMT_RGB32, 0);
1323 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24, 0);
1324 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24, 0);
1325 YUV2RGBWRAPPER(yuv2rgb,, 16, PIX_FMT_RGB565, 0);
1326 YUV2RGBWRAPPER(yuv2rgb,, 15, PIX_FMT_RGB555, 0);
1327 YUV2RGBWRAPPER(yuv2rgb,, 12, PIX_FMT_RGB444, 0);
1328 YUV2RGBWRAPPER(yuv2rgb,, 8, PIX_FMT_RGB8, 0);
1329 YUV2RGBWRAPPER(yuv2rgb,, 4, PIX_FMT_RGB4, 0);
1330 YUV2RGBWRAPPER(yuv2rgb,, 4b, PIX_FMT_RGB4_BYTE, 0);
/* Full-chroma-resolution YUV->RGB: one chroma sample per output pixel (no
 * 2x horizontal chroma subsampling), converted arithmetically with the
 * c->yuv2rgb_* coefficients instead of lookup tables.  `step` is the
 * output bytes per pixel (3 for 24-bit, 4 for 32-bit formats).
 * NOTE(review): extraction artifact — accumulator initializations for Y/A,
 * the R/G/B byte-store lines between the dest[0]/dest[3] alpha stores, and
 * the dest pointer advance are elided from this copy. */
1332 static av_always_inline void
1333 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1334 const int16_t **lumSrc, int lumFilterSize,
1335 const int16_t *chrFilter, const int16_t **chrUSrc,
1336 const int16_t **chrVSrc, int chrFilterSize,
1337 const int16_t **alpSrc, uint8_t *dest,
1338 int dstW, int y, enum PixelFormat target, int hasAlpha)
1341 int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1343 for (i = 0; i < dstW; i++) {
/* U/V start biased by -128 (chroma midpoint) pre-scaled into the filter
 * fixed-point domain, plus a rounding term. */
1346 int U = (1<<9)-(128 << 19);
1347 int V = (1<<9)-(128 << 19);
1351 for (j = 0; j < lumFilterSize; j++) {
1352 Y += lumSrc[j][i] * lumFilter[j];
1354 for (j = 0; j < chrFilterSize; j++) {
1355 U += chrUSrc[j][i] * chrFilter[j];
1356 V += chrVSrc[j][i] * chrFilter[j];
1363 for (j = 0; j < lumFilterSize; j++) {
1364 A += alpSrc[j][i] * lumFilter[j];
1368 A = av_clip_uint8(A);
/* Colorspace conversion in fixed point using the context coefficients. */
1370 Y -= c->yuv2rgb_y_offset;
1371 Y *= c->yuv2rgb_y_coeff;
1373 R = Y + V*c->yuv2rgb_v2r_coeff;
1374 G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1375 B = Y + U*c->yuv2rgb_u2b_coeff;
/* Clip only when a component overflowed the 30-bit working range. */
1376 if ((R | G | B) & 0xC0000000) {
1377 R = av_clip_uintp2(R, 30);
1378 G = av_clip_uintp2(G, 30);
1379 B = av_clip_uintp2(B, 30);
/* Alpha (or opaque 255) goes in byte 0 or byte 3 depending on layout. */
1384 dest[0] = hasAlpha ? A : 255;
1398 dest[3] = hasAlpha ? A : 255;
1401 dest[0] = hasAlpha ? A : 255;
1415 dest[3] = hasAlpha ? A : 255;
/* Full-chroma wrapper instantiations: runtime-alpha variants first, then
 * compile-time alpha (a*) / no-alpha (x*) pairs under CONFIG_SWSCALE_ALPHA,
 * then the 24-bit formats. */
1423 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1424 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR, CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1425 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1426 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB, CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1428 #if CONFIG_SWSCALE_ALPHA
1429 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA, 1);
1430 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR, 1);
1431 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA, 1);
1432 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB, 1);
1434 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA, 0);
1435 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR, 0);
1436 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA, 0);
1437 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB, 0);
1439 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, PIX_FMT_BGR24, 0);
1440 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, PIX_FMT_RGB24, 0);
/* Fill `height` rows of `width` bytes of `plane` with a constant value,
 * starting at row y (parameter elided in this copy) and advancing by
 * `stride` per row (the ptr += stride line is also elided). */
1442 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1443 int width, int height,
1447 uint8_t *ptr = plane + stride*y;
1448 for (i=0; i<height; i++) {
1449 memset(ptr, val, width);
/* Helpers for the rgb48 templates below: input_pixel reads one 16-bit
 * component honoring the format's endianness; r/b map the in-memory
 * component order (b_r/r_b locals) to semantic red/blue for BGR48. */
1454 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1456 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1457 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
/* Convert a row of packed 48-bit RGB/BGR (16 bits per component) to 16-bit
 * luma using the RY/GY/BY fixed-point coefficients with rounding. */
1459 static av_always_inline void
1460 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1461 enum PixelFormat origin)
1464 for (i = 0; i < width; i++) {
1465 unsigned int r_b = input_pixel(&src[i*3+0]);
1466 unsigned int g = input_pixel(&src[i*3+1]);
1467 unsigned int b_r = input_pixel(&src[i*3+2]);
1469 dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
/* Convert a row of packed 48-bit RGB/BGR to 16-bit U/V chroma (one chroma
 * sample per pixel).  Only src1 is read; src2 is part of the common
 * ToUV signature (elsewhere in this file the two are asserted equal). */
1473 static av_always_inline void
1474 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1475 const uint16_t *src1, const uint16_t *src2,
1476 int width, enum PixelFormat origin)
1480 for (i = 0; i < width; i++) {
1481 int r_b = input_pixel(&src1[i*3+0]);
1482 int g = input_pixel(&src1[i*3+1]);
1483 int b_r = input_pixel(&src1[i*3+2]);
1485 dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1486 dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
/* Same as rgb48ToUV_c_template but horizontally subsampled: each output
 * chroma sample is computed from the rounded average of two adjacent
 * input pixels. */
1490 static av_always_inline void
1491 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1492 const uint16_t *src1, const uint16_t *src2,
1493 int width, enum PixelFormat origin)
1497 for (i = 0; i < width; i++) {
1498 int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1499 int g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1500 int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1502 dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1503 dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
/* Generate the public uint8_t* entry points (ToY, ToUV, ToUV_half) for one
 * 48-bit format: each casts the byte pointers to uint16_t* and forwards to
 * the corresponding rgb48*_c_template with the origin format baked in. */
1511 #define rgb48funcs(pattern, BE_LE, origin) \
1512 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
1513 int width, uint32_t *unused) \
1515 const uint16_t *src = (const uint16_t *) _src; \
1516 uint16_t *dst = (uint16_t *) _dst; \
1517 rgb48ToY_c_template(dst, src, width, origin); \
1520 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1521 const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1522 int width, uint32_t *unused) \
1524 const uint16_t *src1 = (const uint16_t *) _src1, \
1525 *src2 = (const uint16_t *) _src2; \
1526 uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1527 rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1530 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1531 const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
1532 int width, uint32_t *unused) \
1534 const uint16_t *src1 = (const uint16_t *) _src1, \
1535 *src2 = (const uint16_t *) _src2; \
1536 uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1537 rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
/* Instantiate input converters for all four 48-bit formats. */
1540 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1541 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1542 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1543 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
/* Pixel fetch for the 16/32-bit packed-RGB templates: 32-bit formats read
 * a native-aligned 32-bit word; 16-bit formats read a 16-bit word honoring
 * the format's endianness. */
1545 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1546 origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1547 (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1549 static av_always_inline void
1550 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1551 int width, enum PixelFormat origin,
1552 int shr, int shg, int shb, int shp,
1553 int maskr, int maskg, int maskb,
1554 int rsh, int gsh, int bsh, int S)
1556 const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1557 rnd = (32<<((S)-1)) + (1<<(S-7));
1560 for (i = 0; i < width; i++) {
1561 int px = input_pixel(i) >> shp;
1562 int b = (px & maskb) >> shb;
1563 int g = (px & maskg) >> shg;
1564 int r = (px & maskr) >> shr;
1566 dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
/* Generic packed-RGB (16/32 bpp) to U/V converter, one chroma sample per
 * pixel; same mask/shift parameterization as rgb16_32ToY_c_template, with
 * the chroma midpoint (256<<(S-1)) folded into the rounding constant. */
1570 static av_always_inline void
1571 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1572 const uint8_t *src, int width,
1573 enum PixelFormat origin,
1574 int shr, int shg, int shb, int shp,
1575 int maskr, int maskg, int maskb,
1576 int rsh, int gsh, int bsh, int S)
1578 const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1579 rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1580 rnd = (256<<((S)-1)) + (1<<(S-7));
1583 for (i = 0; i < width; i++) {
1584 int px = input_pixel(i) >> shp;
1585 int b = (px & maskb) >> shb;
1586 int g = (px & maskg) >> shg;
1587 int r = (px & maskr) >> shr;
1589 dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1590 dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1594 static av_always_inline void
1595 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1596 const uint8_t *src, int width,
1597 enum PixelFormat origin,
1598 int shr, int shg, int shb, int shp,
1599 int maskr, int maskg, int maskb,
1600 int rsh, int gsh, int bsh, int S)
1602 const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1603 rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1604 rnd = (256U<<(S)) + (1<<(S-6)), maskgx = ~(maskr | maskb);
1607 maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1608 for (i = 0; i < width; i++) {
1609 int px0 = input_pixel(2 * i + 0) >> shp;
1610 int px1 = input_pixel(2 * i + 1) >> shp;
1611 int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1612 int rb = px0 + px1 - g;
1614 b = (rb & maskb) >> shb;
1615 if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1616 origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1619 g = (g & maskg) >> shg;
1621 r = (rb & maskr) >> shr;
1623 dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1624 dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
/* Generate the three input-converter entry points (ToY, ToUV, ToUV_half)
 * for one packed 16/32-bit RGB layout, forwarding to the templates above
 * with all mask/shift parameters baked in as compile-time constants. */
1630 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1631 maskg, maskb, rsh, gsh, bsh, S) \
1632 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
1633 int width, uint32_t *unused) \
1635 rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, \
1636 shr, shg, shb, shp, \
1637 maskr, maskg, maskb, rsh, gsh, bsh, S); \
1640 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1641 const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1642 int width, uint32_t *unused) \
1644 rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
1645 shr, shg, shb, shp, \
1646 maskr, maskg, maskb, rsh, gsh, bsh, S); \
1649 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1650 const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
1651 int width, uint32_t *unused) \
1653 rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
1654 shr, shg, shb, shp, \
1655 maskr, maskg, maskb, rsh, gsh, bsh, S); \
/* Layout tables for every supported 16/32-bit packed-RGB input.  555
 * formats use S = RGB2YUV_SHIFT+7 (5-bit components, one less fractional
 * bit than 565/8888's +8). */
1658 rgb16_32_wrapper(PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT+8);
1659 rgb16_32_wrapper(PIX_FMT_BGR32_1, bgr321, 16, 0, 0, 8, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT+8);
1660 rgb16_32_wrapper(PIX_FMT_RGB32, rgb32, 0, 0, 16, 0, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT+8);
1661 rgb16_32_wrapper(PIX_FMT_RGB32_1, rgb321, 0, 0, 16, 8, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT+8);
1662 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT+8);
1663 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT+7);
1664 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT+8);
1665 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT+7);
1666 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT+8);
1667 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT+7);
1668 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT+8);
1669 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT+7);
/* Planar GBR (8-bit, separate g/b/r planes) to luma.
 * NOTE(review): dst is uint16_t* here while most ToY converters in this
 * file take int16_t* — confirm against the function-pointer type this is
 * assigned to. */
1671 static void gbr24pToY_c(uint16_t *dst, const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
1672 int width, uint32_t *unused)
1675 for (i = 0; i < width; i++) {
1676 unsigned int g = gsrc[i];
1677 unsigned int b = bsrc[i];
1678 unsigned int r = rsrc[i];
1680 dst[i] = (RY*r + GY*g + BY*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
/* Planar GBR to U/V, one chroma sample per pixel; the 0x4001 rounding term
 * folds in the chroma midpoint bias. */
1684 static void gbr24pToUV_c(uint16_t *dstU, uint16_t *dstV,
1685 const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
1686 int width, enum PixelFormat origin)
1689 for (i = 0; i < width; i++) {
1690 unsigned int g = gsrc[i];
1691 unsigned int b = bsrc[i];
1692 unsigned int r = rsrc[i];
1694 dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
1695 dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
/* Planar GBR to U/V with 2x horizontal chroma subsampling: each component
 * is the SUM of two adjacent pixels (hence the extra >>1 via the larger
 * rounding/shift constants). */
1699 static void gbr24pToUV_half_c(uint16_t *dstU, uint16_t *dstV,
1700 const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
1701 int width, enum PixelFormat origin)
1704 for (i = 0; i < width; i++) {
1705 unsigned int g = gsrc[2*i] + gsrc[2*i+1];
1706 unsigned int b = bsrc[2*i] + bsrc[2*i+1];
1707 unsigned int r = rsrc[2*i] + rsrc[2*i+1];
1709 dstU[i] = (RU*r + GU*g + BU*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
1710 dstV[i] = (RV*r + GV*g + BV*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
/* Extract the alpha channel from ABGR/ARGB-style input (alpha is byte 0 of
 * each 4-byte pixel), widened to the 14-bit intermediate range (<<6). */
1714 static void abgrToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1717 for (i=0; i<width; i++) {
1718 dst[i]= src[4*i]<<6;
/* Extract the alpha channel from RGBA/BGRA-style input (alpha is byte 3 of
 * each 4-byte pixel), widened to the 14-bit intermediate range (<<6). */
1722 static void rgbaToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1725 for (i=0; i<width; i++) {
1726 dst[i]= src[4*i+3]<<6;
/* Extract alpha from PAL8 input: look up each index in the 32-bit palette
 * and take the top byte (the `int d = src[i]` line is elided in this copy). */
1730 static void palToA_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
1733 for (i=0; i<width; i++) {
1736 dst[i]= (pal[d] >> 24)<<6;
/* Extract luma from PAL8 input via the palette's low byte.
 * NOTE(review): `long width` is inconsistent with the `int width` used by
 * every sibling converter — verify against the function-pointer type. */
1740 static void palToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, long width, uint32_t *pal)
1743 for (i=0; i<width; i++) {
1746 dst[i]= (pal[d] & 0xFF)<<6;
/* Extract U (palette byte 1) and V (palette byte 2) from PAL8 input.
 * NOTE(review): dstU is declared uint16_t* but dstV int16_t* — this
 * mismatch looks like a typo relative to the sibling converters; confirm
 * and unify with the expected function-pointer signature. */
1750 static void palToUV_c(uint16_t *dstU, int16_t *dstV,
1751 const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1752 int width, uint32_t *pal)
1755 assert(src1 == src2);
1756 for (i=0; i<width; i++) {
1757 int p= pal[src1[i]];
1759 dstU[i]= (uint8_t)(p>> 8)<<6;
1760 dstV[i]= (uint8_t)(p>>16)<<6;
/* Expand 1-bit-per-pixel "monowhite" input to 15-bit luma (each bit maps
 * to 0 or 16383).  NOTE(review): the line loading/inverting the packed
 * byte into `d` is elided in this copy — for monowhite (0 = white) it is
 * expected to invert the source byte; confirm against upstream. */
1764 static void monowhite2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1767 for (i=0; i<width/8; i++) {
1770 dst[8*i+j]= ((d>>(7-j))&1)*16383;
/* Tail handling for widths not divisible by 8. */
1774 for(j=0; j<(width&7); j++)
1775 dst[8*i+j]= ((d>>(7-j))&1)*16383;
/* Expand 1-bit-per-pixel "monoblack" input to 15-bit luma (each bit maps
 * to 0 or 16383; the line loading the packed byte into `d` is elided in
 * this copy). */
1779 static void monoblack2Y_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1782 for (i=0; i<width/8; i++) {
1785 dst[8*i+j]= ((d>>(7-j))&1)*16383;
/* Tail handling for widths not divisible by 8. */
1789 for(j=0; j<(width&7); j++)
1790 dst[8*i+j]= ((d>>(7-j))&1)*16383;
1794 //FIXME yuy2* can read up to 7 samples too much
/* Extract luma from YUY2 (Y at even byte positions; the dst[i]=src[2*i]
 * body line is elided in this copy). */
1796 static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1800 for (i=0; i<width; i++)
/* Extract U/V from YUY2 (U at byte 1, V at byte 3 of each 4-byte pair).
 * Only src1 is read; the assert documents that both lines are the same. */
1804 static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1805 const uint8_t *src2, int width, uint32_t *unused)
1808 for (i=0; i<width; i++) {
1809 dstU[i]= src1[4*i + 1];
1810 dstV[i]= src1[4*i + 3];
1812 assert(src1 == src2);
/* Byte-swap a row of 16-bit luma samples (endianness conversion for
 * 16-bit-per-component inputs). */
1815 static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
1818 const uint16_t *src = (const uint16_t *) _src;
1819 uint16_t *dst = (uint16_t *) _dst;
1820 for (i=0; i<width; i++) {
1821 dst[i] = av_bswap16(src[i]);
/* Byte-swap a row of 16-bit U and V samples (endianness conversion). */
1825 static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
1826 const uint8_t *_src2, int width, uint32_t *unused)
1829 const uint16_t *src1 = (const uint16_t *) _src1,
1830 *src2 = (const uint16_t *) _src2;
1831 uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV;
1832 for (i=0; i<width; i++) {
1833 dstU[i] = av_bswap16(src1[i]);
1834 dstV[i] = av_bswap16(src2[i]);
1838 /* This is almost identical to the previous, end exists only because
1839 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract luma from UYVY (Y at odd byte positions; the dst[i]=src[2*i+1]
 * body line is elided in this copy). */
1840 static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1844 for (i=0; i<width; i++)
/* Extract U/V from UYVY (U at byte 0, V at byte 2 of each 4-byte pair).
 * Only src1 is read; the assert documents that both lines are the same. */
1848 static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1849 const uint8_t *src2, int width, uint32_t *unused)
1852 for (i=0; i<width; i++) {
1853 dstU[i]= src1[4*i + 0];
1854 dstV[i]= src1[4*i + 2];
1856 assert(src1 == src2);
/* De-interleave a semi-planar chroma row: even bytes go to dst1, odd bytes
 * to dst2 (shared by the NV12/NV21 wrappers below, which swap dst order). */
1859 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1860 const uint8_t *src, int width)
1863 for (i = 0; i < width; i++) {
1864 dst1[i] = src[2*i+0];
1865 dst2[i] = src[2*i+1];
/* NV12: interleaved chroma is U,V,U,V... — U to dstU, V to dstV. */
1869 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1870 const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1871 int width, uint32_t *unused)
1873 nvXXtoUV_c(dstU, dstV, src1, width);
/* NV21: interleaved chroma is V,U,V,U... — destinations swapped vs NV12. */
1876 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1877 const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
1878 int width, uint32_t *unused)
1880 nvXXtoUV_c(dstV, dstU, src1, width);
1883 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
/* Packed BGR24 to luma (the b/g/r byte loads from src[3*i+0..2] are
 * elided in this copy; cf. bgr24ToUV_c below for the component order). */
1885 static void bgr24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
1886 int width, uint32_t *unused)
1889 for (i=0; i<width; i++) {
1894 dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
/* Packed BGR24 to U/V, one chroma sample per pixel; the 256<<(SHIFT-1)
 * term is the chroma midpoint bias, and the extra 6 fractional bits are
 * kept in the output (>>SHIFT-6). */
1898 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1899 const uint8_t *src2, int width, uint32_t *unused)
1902 for (i=0; i<width; i++) {
1903 int b= src1[3*i + 0];
1904 int g= src1[3*i + 1];
1905 int r= src1[3*i + 2];
1907 dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1908 dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1910 assert(src1 == src2);
/* Packed BGR24 to U/V with 2x horizontal subsampling: components are the
 * SUM of two adjacent pixels, compensated by the one-larger final shift. */
1913 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1914 const uint8_t *src2, int width, uint32_t *unused)
1917 for (i=0; i<width; i++) {
1918 int b= src1[6*i + 0] + src1[6*i + 3];
1919 int g= src1[6*i + 1] + src1[6*i + 4];
1920 int r= src1[6*i + 2] + src1[6*i + 5];
1922 dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1923 dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1925 assert(src1 == src2);
/* Packed RGB24 to luma (the r/g/b byte loads from src[3*i+0..2] are
 * elided in this copy; cf. rgb24ToUV_c below for the component order). */
1928 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
1932 for (i=0; i<width; i++) {
1937 dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
/* Packed RGB24 to U/V, one chroma sample per pixel (mirror of bgr24ToUV_c
 * with r and b swapped in memory order). */
1941 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1942 const uint8_t *src2, int width, uint32_t *unused)
1946 for (i=0; i<width; i++) {
1947 int r= src1[3*i + 0];
1948 int g= src1[3*i + 1];
1949 int b= src1[3*i + 2];
1951 dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1952 dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
/* Packed RGB24 to U/V with 2x horizontal subsampling (mirror of
 * bgr24ToUV_half_c with r and b swapped in memory order). */
1956 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *unused0, const uint8_t *src1,
1957 const uint8_t *src2, int width, uint32_t *unused)
1961 for (i=0; i<width; i++) {
1962 int r= src1[6*i + 0] + src1[6*i + 3];
1963 int g= src1[6*i + 1] + src1[6*i + 4];
1964 int b= src1[6*i + 2] + src1[6*i + 5];
1966 dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1967 dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
/* Horizontal FIR scaling of a 16-bit input row into the 19-bit (int32)
 * intermediate format.  The shift `sh` is derived from the source bit
 * depth; RGB/PAL8 sources with <16-bit depth use a fixed shift (the
 * assignment line pairing the if at line 1981 is partially elided). */
1971 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
1972 const int16_t *filter,
1973 const int16_t *filterPos, int filterSize)
1976 int32_t *dst = (int32_t *) _dst;
1977 const uint16_t *src = (const uint16_t *) _src;
1978 int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1981 if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
1984 for (i = 0; i < dstW; i++) {
1986 int srcPos = filterPos[i];
1989 for (j = 0; j < filterSize; j++) {
1990 val += src[srcPos + j] * filter[filterSize * i + j];
1992 // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1993 dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
/* Horizontal FIR scaling of a 16-bit input row into the 15-bit (int16)
 * intermediate format; RGB/PAL8 sources use a fixed shift of 13, others a
 * shift derived from the source bit depth. */
1997 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
1998 const int16_t *filter,
1999 const int16_t *filterPos, int filterSize)
2002 const uint16_t *src = (const uint16_t *) _src;
2003 int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2006 sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2008 for (i = 0; i < dstW; i++) {
2010 int srcPos = filterPos[i];
2013 for (j = 0; j < filterSize; j++) {
2014 val += src[srcPos + j] * filter[filterSize * i + j];
2016 // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2017 dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2021 // bilinear / bicubic scaling
/* Horizontal FIR scaling of an 8-bit input row into the 15-bit (int16)
 * intermediate format: 8-bit samples * 14-bit filter = 22 bits, >>7 gives
 * 15 bits, clamped because bicubic filters can overshoot. */
2022 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2023 const int16_t *filter, const int16_t *filterPos,
2027 for (i=0; i<dstW; i++) {
2029 int srcPos= filterPos[i];
2031 for (j=0; j<filterSize; j++) {
2032 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2034 //filter += hFilterSize;
2035 dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
/* Horizontal FIR scaling of an 8-bit input row into the 19-bit (int32)
 * intermediate format (>>3 instead of >>7 to keep four more bits). */
2040 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2041 const int16_t *filter, const int16_t *filterPos,
2045 int32_t *dst = (int32_t *) _dst;
2046 for (i=0; i<dstW; i++) {
2048 int srcPos= filterPos[i];
2050 for (j=0; j<filterSize; j++) {
2051 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2053 //filter += hFilterSize;
2054 dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2059 //FIXME all pal and rgb srcFormats could do this convertion as well
2060 //FIXME all scalers more complex than bilinear could do half of this transform
/* Expand 15-bit chroma from limited (MPEG) to full (JPEG) range in place. */
2061 static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
2064 for (i = 0; i < width; i++) {
2065 dstU[i] = (FFMIN(dstU[i],30775)*4663 - 9289992)>>12; //-264
2066 dstV[i] = (FFMIN(dstV[i],30775)*4663 - 9289992)>>12; //-264
/* Compress 15-bit chroma from full (JPEG) to limited (MPEG) range in place. */
2069 static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
2072 for (i = 0; i < width; i++) {
2073 dstU[i] = (dstU[i]*1799 + 4081085)>>11; //1469
2074 dstV[i] = (dstV[i]*1799 + 4081085)>>11; //1469
/* Expand 15-bit luma from limited (MPEG) to full (JPEG) range in place. */
2077 static void lumRangeToJpeg_c(int16_t *dst, int width)
2080 for (i = 0; i < width; i++)
2081 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
/* Compress 15-bit luma from full (JPEG) to limited (MPEG) range in place. */
2083 static void lumRangeFromJpeg_c(int16_t *dst, int width)
2086 for (i = 0; i < width; i++)
2087 dst[i] = (dst[i]*14071 + 33561947)>>14;
/* 19-bit (int32 buffer) variant of chrRangeToJpeg_c: constants scaled by
 * 16 to match the extra 4 fractional bits. */
2090 static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
2093 int32_t *dstU = (int32_t *) _dstU;
2094 int32_t *dstV = (int32_t *) _dstV;
2095 for (i = 0; i < width; i++) {
2096 dstU[i] = (FFMIN(dstU[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
2097 dstV[i] = (FFMIN(dstV[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
/* 19-bit (int32 buffer) variant of chrRangeFromJpeg_c. */
2100 static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
2103 int32_t *dstU = (int32_t *) _dstU;
2104 int32_t *dstV = (int32_t *) _dstV;
2105 for (i = 0; i < width; i++) {
2106 dstU[i] = (dstU[i]*1799 + (4081085<<4))>>11; //1469
2107 dstV[i] = (dstV[i]*1799 + (4081085<<4))>>11; //1469
/* 19-bit (int32 buffer) variant of lumRangeToJpeg_c (coefficient and
 * shift rescaled: 19077>>2 = 4769, >>14 -> >>12). */
2110 static void lumRangeToJpeg16_c(int16_t *_dst, int width)
2113 int32_t *dst = (int32_t *) _dst;
2114 for (i = 0; i < width; i++)
2115 dst[i] = (FFMIN(dst[i],30189<<4)*4769 - (39057361<<2))>>12;
/* 19-bit (int32 buffer) variant of lumRangeFromJpeg_c. */
2117 static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
2120 int32_t *dst = (int32_t *) _dst;
2121 for (i = 0; i < width; i++)
2122 dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
/* Fast (low-quality) bilinear horizontal luma scaler: walks the source
 * line with a 16.16 fixed-point position and linearly interpolates
 * between neighboring pixels, producing 15-bit output (8-bit input << 7).
 *
 * Fix: restore the `xpos += xInc;` step at the end of the main loop —
 * without it the fixed-point source position never advances and every
 * output pixel would be computed from src[0]. */
static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
                           const uint8_t *src, int srcW, int xInc)
{
    int i;
    unsigned int xpos = 0;
    for (i = 0; i < dstWidth; i++) {
        unsigned int xx     = xpos >> 16;           // integer source index
        unsigned int xalpha = (xpos & 0xFFFF) >> 9; // 7-bit blend weight
        dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
        xpos += xInc; // advance 16.16 fixed-point source position
    }
    /* Outputs that would interpolate past the last source pixel get the
     * last pixel replicated instead. */
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
        dst[i] = src[srcW - 1] * 128;
}
2140 // *** horizontal scale Y line to temp buffer
2141 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2142 const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
2144 const int16_t *hLumFilter,
2145 const int16_t *hLumFilterPos, int hLumFilterSize,
2146 uint8_t *formatConvBuffer,
2147 uint32_t *pal, int isAlpha)
2149 void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2150 void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2153 toYV12(formatConvBuffer, src, src2, src3, srcW, pal);
2154 src= formatConvBuffer;
2157 if (!c->hyscale_fast) {
2158 c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2159 } else { // fast bilinear upscale / crap downscale
2160 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2164 convertRange(dst, dstWidth);
/* Fast (low-quality) bilinear horizontal chroma scaler: scales the U and
 * V lines in lockstep using a 16.16 fixed-point source position and a
 * 7-bit blend weight (xalpha ^ 127 == 127 - xalpha for xalpha < 128).
 *
 * Fix: restore the `xpos += xInc;` step at the end of the main loop —
 * without it the fixed-point source position never advances and every
 * output pixel would be computed from the first source pixel. */
static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
                           int dstWidth, const uint8_t *src1,
                           const uint8_t *src2, int srcW, int xInc)
{
    int i;
    unsigned int xpos = 0;
    for (i = 0; i < dstWidth; i++) {
        unsigned int xx     = xpos >> 16;           // integer source index
        unsigned int xalpha = (xpos & 0xFFFF) >> 9; // 7-bit blend weight
        dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
        dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
        xpos += xInc; // advance 16.16 fixed-point source position
    }
    /* Outputs that would interpolate past the last source pixel get the
     * last pixel replicated instead. */
    for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
        dst1[i] = src1[srcW - 1] * 128;
        dst2[i] = src2[srcW - 1] * 128;
    }
}
2186 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2187 const uint8_t *src0, const uint8_t *src1, const uint8_t *src2,
2188 int srcW, int xInc, const int16_t *hChrFilter,
2189 const int16_t *hChrFilterPos, int hChrFilterSize,
2190 uint8_t *formatConvBuffer, uint32_t *pal)
2193 uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2194 c->chrToYV12(formatConvBuffer, buf2, src0, src1, src2, srcW, pal);
2195 src1= formatConvBuffer;
2199 if (!c->hcscale_fast) {
2200 c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2201 c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2202 } else { // fast bilinear upscale / crap downscale
2203 c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2206 if (c->chrConvertRange)
2207 c->chrConvertRange(dst1, dst2, dstWidth);
/**
 * Select the C output ("write-back") functions for c->dstFormat:
 * the planar writers (yuv2plane1/yuv2planeX) chosen by bit depth and
 * endianness, the NV12/NV21 interleaved-chroma writer, and the packed
 * writers used with 1-tap, 2-tap and N-tap vertical filters.
 * NOTE(review): several case labels / braces of this dispatch are elided
 * in this excerpt; only the visible code is reproduced here.
 */
static av_always_inline void
find_c_packed_planar_out_funcs(SwsContext *c,
                               yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX,
                               yuv2interleavedX_fn *yuv2nv12cX,
                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
                               yuv2packedX_fn *yuv2packedX)
    enum PixelFormat dstFormat = c->dstFormat;

    /* planar writers: depth and endianness pick the variant */
    if (is16BPS(dstFormat)) {
        *yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c : yuv2planeX_16LE_c;
        *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c : yuv2plane1_16LE_c;
    } else if (is9_OR_10BPS(dstFormat)) {
        /* depth_minus1 == 8 means 9 bits per component, otherwise 10 */
        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c : yuv2planeX_9LE_c;
            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c : yuv2plane1_9LE_c;
            *yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c : yuv2planeX_10LE_c;
            *yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c : yuv2plane1_10LE_c;
        /* 8 bits per component */
        *yuv2plane1 = yuv2plane1_8_c;
        *yuv2planeX = yuv2planeX_8_c;
        if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21)
            *yuv2nv12cX = yuv2nv12cX_c;

    /* packed writers, full horizontal chroma interpolation variants */
    if(c->flags & SWS_FULL_CHR_H_INT) {
        switch (dstFormat) {
            *yuv2packedX = yuv2rgba32_full_X_c;
#if CONFIG_SWSCALE_ALPHA
            *yuv2packedX = yuv2rgba32_full_X_c;
#endif /* CONFIG_SWSCALE_ALPHA */
            *yuv2packedX = yuv2rgbx32_full_X_c;
#endif /* !CONFIG_SMALL */
            *yuv2packedX = yuv2argb32_full_X_c;
#if CONFIG_SWSCALE_ALPHA
            *yuv2packedX = yuv2argb32_full_X_c;
#endif /* CONFIG_SWSCALE_ALPHA */
            *yuv2packedX = yuv2xrgb32_full_X_c;
#endif /* !CONFIG_SMALL */
            *yuv2packedX = yuv2bgra32_full_X_c;
#if CONFIG_SWSCALE_ALPHA
            *yuv2packedX = yuv2bgra32_full_X_c;
#endif /* CONFIG_SWSCALE_ALPHA */
            *yuv2packedX = yuv2bgrx32_full_X_c;
#endif /* !CONFIG_SMALL */
            *yuv2packedX = yuv2abgr32_full_X_c;
#if CONFIG_SWSCALE_ALPHA
            *yuv2packedX = yuv2abgr32_full_X_c;
#endif /* CONFIG_SWSCALE_ALPHA */
            *yuv2packedX = yuv2xbgr32_full_X_c;
#endif /* !CONFIG_SMALL */
            *yuv2packedX = yuv2rgb24_full_X_c;
            *yuv2packedX = yuv2bgr24_full_X_c;
    /* packed writers, regular (non-full-chroma) variants */
    switch (dstFormat) {
    case PIX_FMT_GRAY16BE:
        *yuv2packed1 = yuv2gray16BE_1_c;
        *yuv2packed2 = yuv2gray16BE_2_c;
        *yuv2packedX = yuv2gray16BE_X_c;
    case PIX_FMT_GRAY16LE:
        *yuv2packed1 = yuv2gray16LE_1_c;
        *yuv2packed2 = yuv2gray16LE_2_c;
        *yuv2packedX = yuv2gray16LE_X_c;
    case PIX_FMT_MONOWHITE:
        *yuv2packed1 = yuv2monowhite_1_c;
        *yuv2packed2 = yuv2monowhite_2_c;
        *yuv2packedX = yuv2monowhite_X_c;
    case PIX_FMT_MONOBLACK:
        *yuv2packed1 = yuv2monoblack_1_c;
        *yuv2packed2 = yuv2monoblack_2_c;
        *yuv2packedX = yuv2monoblack_X_c;
    case PIX_FMT_YUYV422:
        *yuv2packed1 = yuv2yuyv422_1_c;
        *yuv2packed2 = yuv2yuyv422_2_c;
        *yuv2packedX = yuv2yuyv422_X_c;
    case PIX_FMT_UYVY422:
        *yuv2packed1 = yuv2uyvy422_1_c;
        *yuv2packed2 = yuv2uyvy422_2_c;
        *yuv2packedX = yuv2uyvy422_X_c;
    case PIX_FMT_RGB48LE:
        *yuv2packed1 = yuv2rgb48le_1_c;
        *yuv2packed2 = yuv2rgb48le_2_c;
        *yuv2packedX = yuv2rgb48le_X_c;
    case PIX_FMT_RGB48BE:
        *yuv2packed1 = yuv2rgb48be_1_c;
        *yuv2packed2 = yuv2rgb48be_2_c;
        *yuv2packedX = yuv2rgb48be_X_c;
    case PIX_FMT_BGR48LE:
        *yuv2packed1 = yuv2bgr48le_1_c;
        *yuv2packed2 = yuv2bgr48le_2_c;
        *yuv2packedX = yuv2bgr48le_X_c;
    case PIX_FMT_BGR48BE:
        *yuv2packed1 = yuv2bgr48be_1_c;
        *yuv2packed2 = yuv2bgr48be_2_c;
        *yuv2packedX = yuv2bgr48be_X_c;
        *yuv2packed1 = yuv2rgb32_1_c;
        *yuv2packed2 = yuv2rgb32_2_c;
        *yuv2packedX = yuv2rgb32_X_c;
#if CONFIG_SWSCALE_ALPHA
        *yuv2packed1 = yuv2rgba32_1_c;
        *yuv2packed2 = yuv2rgba32_2_c;
        *yuv2packedX = yuv2rgba32_X_c;
#endif /* CONFIG_SWSCALE_ALPHA */
        *yuv2packed1 = yuv2rgbx32_1_c;
        *yuv2packed2 = yuv2rgbx32_2_c;
        *yuv2packedX = yuv2rgbx32_X_c;
#endif /* !CONFIG_SMALL */
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        *yuv2packed1 = yuv2rgb32_1_1_c;
        *yuv2packed2 = yuv2rgb32_1_2_c;
        *yuv2packedX = yuv2rgb32_1_X_c;
#if CONFIG_SWSCALE_ALPHA
        *yuv2packed1 = yuv2rgba32_1_1_c;
        *yuv2packed2 = yuv2rgba32_1_2_c;
        *yuv2packedX = yuv2rgba32_1_X_c;
#endif /* CONFIG_SWSCALE_ALPHA */
        *yuv2packed1 = yuv2rgbx32_1_1_c;
        *yuv2packed2 = yuv2rgbx32_1_2_c;
        *yuv2packedX = yuv2rgbx32_1_X_c;
#endif /* !CONFIG_SMALL */
        *yuv2packed1 = yuv2rgb24_1_c;
        *yuv2packed2 = yuv2rgb24_2_c;
        *yuv2packedX = yuv2rgb24_X_c;
        *yuv2packed1 = yuv2bgr24_1_c;
        *yuv2packed2 = yuv2bgr24_2_c;
        *yuv2packedX = yuv2bgr24_X_c;
    case PIX_FMT_RGB565LE:
    case PIX_FMT_RGB565BE:
    case PIX_FMT_BGR565LE:
    case PIX_FMT_BGR565BE:
        *yuv2packed1 = yuv2rgb16_1_c;
        *yuv2packed2 = yuv2rgb16_2_c;
        *yuv2packedX = yuv2rgb16_X_c;
    case PIX_FMT_RGB555LE:
    case PIX_FMT_RGB555BE:
    case PIX_FMT_BGR555LE:
    case PIX_FMT_BGR555BE:
        *yuv2packed1 = yuv2rgb15_1_c;
        *yuv2packed2 = yuv2rgb15_2_c;
        *yuv2packedX = yuv2rgb15_X_c;
    case PIX_FMT_RGB444LE:
    case PIX_FMT_RGB444BE:
    case PIX_FMT_BGR444LE:
    case PIX_FMT_BGR444BE:
        *yuv2packed1 = yuv2rgb12_1_c;
        *yuv2packed2 = yuv2rgb12_2_c;
        *yuv2packedX = yuv2rgb12_X_c;
        *yuv2packed1 = yuv2rgb8_1_c;
        *yuv2packed2 = yuv2rgb8_2_c;
        *yuv2packedX = yuv2rgb8_X_c;
        *yuv2packed1 = yuv2rgb4_1_c;
        *yuv2packed2 = yuv2rgb4_2_c;
        *yuv2packedX = yuv2rgb4_X_c;
    case PIX_FMT_RGB4_BYTE:
    case PIX_FMT_BGR4_BYTE:
        *yuv2packed1 = yuv2rgb4b_1_c;
        *yuv2packed2 = yuv2rgb4b_2_c;
        *yuv2packedX = yuv2rgb4b_X_c;
2455 #define DEBUG_SWSCALE_BUFFERS 0
2456 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
/**
 * Core scaling loop for one input slice.
 *
 * Horizontally scales the newly available source lines of this slice
 * into the luma/chroma ring buffers, then vertically scales and writes
 * out every destination line whose complete set of input lines is now
 * buffered.  State that spans slices (ring-buffer indices, last buffered
 * input lines) is reloaded from and stored back into the context.
 * NOTE(review): some braces/statements are elided in this excerpt; the
 * visible code is reproduced unchanged.
 *
 * @return number of destination lines written for this slice
 */
static int swScale(SwsContext *c, const uint8_t* src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t* dst[], int dstStride[])
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **chrVPixBuf= c->chrVPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* chroma slice position/height, adjusted for vertical subsampling
       (the height is rounded up, hence the double negation) */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    uint32_t *pal=c->pal_yuv;
    /* sources deeper than 8 bits need dithering on output */
    int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
    yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
    yuv2planarX_fn yuv2planeX = c->yuv2planeX;
    yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;

    /* vars which will change and which we need to store back in the context */
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)) {
    srcStride[3]= srcStride[0];
    /* vChrDrop drops chroma lines by widening the effective stride */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                  srcSliceY, srcSliceH, dstY, dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                  vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {

    if (!should_dither) {
        c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
    /* main per-destination-line loop */
    for (;dstY < dstH; dstY++) {
        const int chrDstY= dstY>>c->chrDstVSubSample;
        uint8_t *dest[4] = {
            dst[0] + dstStride[0] * dstY,
            dst[1] + dstStride[1] * chrDstY,
            dst[2] + dstStride[2] * chrDstY,
            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
        int use_mmx_vfilter= c->use_mmx_vfilter;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* not enough input yet: buffer what we have and stop */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                          lastLumSrcY, lastChrSrcY);

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[1]+(lastInLumBuf + 1 - srcSliceY)*srcStride[1];
            const uint8_t *src3= src[2]+(lastInLumBuf + 1 - srcSliceY)*srcStride[2];
            const uint8_t *src4= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, src2, src3, srcW, lumXInc,
                    hLumFilter, hLumFilterPos, hLumFilterSize,
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src4, NULL, NULL, srcW,
                        lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                          lumBufIndex, lastInLumBuf);
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src0= src[0]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[0];
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
                        chrDstW, src0, src1, src2, chrSrcW, chrXInc,
                        hChrFilter, hChrFilterPos, hChrFilterSize,
                        formatConvBuffer, pal);
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                          chrBufIndex, lastInChrBuf);
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice

        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
        if (should_dither) {
            c->chrDither8 = dither_8x8_128[chrDstY & 7];
            c->lumDither8 = dither_8x8_128[dstY & 7];
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
            find_c_packed_planar_out_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
                                           &yuv2packed1, &yuv2packed2, &yuv2packedX);

            /* pointers into the ring buffers for the lines this output
               line's vertical filter needs */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            int16_t *vLumFilter= c->vLumFilter;
            int16_t *vChrFilter= c->vChrFilter;

            if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;

                vLumFilter += dstY * vLumFilterSize;
                vChrFilter += chrDstY * vChrFilterSize;

                av_assert0(use_mmx_vfilter != (
                           yuv2planeX == yuv2planeX_10BE_c
                        || yuv2planeX == yuv2planeX_10LE_c
                        || yuv2planeX == yuv2planeX_9BE_c
                        || yuv2planeX == yuv2planeX_9LE_c
                        || yuv2planeX == yuv2planeX_16BE_c
                        || yuv2planeX == yuv2planeX_16LE_c
                        || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);

                if(use_mmx_vfilter){
                    vLumFilter= c->lumMmxFilter;
                    vChrFilter= c->chrMmxFilter;

                if (vLumFilterSize == 1) {
                    yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
                    yuv2planeX(vLumFilter, vLumFilterSize,
                               lumSrcPtr, dest[0], dstW, c->lumDither8, 0);

                /* chroma is written only on non-skipped (subsampled) lines */
                if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
                        yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
                    } else if (vChrFilterSize == 1) {
                        yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
                        yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
                        yuv2planeX(vChrFilter, vChrFilterSize,
                                   chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
                        yuv2planeX(vChrFilter, vChrFilterSize,
                                   chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);

                if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                    if(use_mmx_vfilter){
                        vLumFilter= c->alpMmxFilter;
                    if (vLumFilterSize == 1) {
                        yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
                        yuv2planeX(vLumFilter, vLumFilterSize,
                                   alpSrcPtr, dest[3], dstW, c->lumDither8, 0);

                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? *alpSrcPtr : NULL,
                                dest[0], dstW, chrAlpha, dstY);
                } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha = vLumFilter[2 * dstY + 1];
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    lumMmxFilter[3] = vLumFilter[2 * dstY ] * 0x10001;
                    chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
                    yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? alpSrcPtr : NULL,
                                dest[0], dstW, lumAlpha, chrAlpha, dstY);
                } else { //general RGB
                    yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
                                lumSrcPtr, vLumFilterSize,
                                vChrFilter + dstY * vChrFilterSize,
                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest[0], dstW, dstY);

    /* target has an alpha plane but the source provided none: fill with 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
        __asm__ volatile("sfence":::"memory");

    /* store changed local vars back in the context */
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
/**
 * Initialize the per-format C function pointers of the context:
 * the output writers (via find_c_packed_planar_out_funcs), the input
 * unpackers (chrToYV12/lumToYV12/alpToYV12), the horizontal scalers
 * (including the fast-bilinear paths), and the limited<->full range
 * conversion helpers.
 * NOTE(review): some switch openers / case labels are elided in this
 * excerpt; the visible code is reproduced unchanged.
 */
static av_cold void sws_init_swScale_c(SwsContext *c)
    enum PixelFormat srcFormat = c->srcFormat;

    find_c_packed_planar_out_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
                                   &c->yuv2nv12cX, &c->yuv2packed1, &c->yuv2packed2,

    /* chroma unpacker: converts source chroma to planar 8-bit */
    c->chrToYV12 = NULL;
    case PIX_FMT_YUYV422 : c->chrToYV12 = yuy2ToUV_c; break;
    case PIX_FMT_UYVY422 : c->chrToYV12 = uyvyToUV_c; break;
    case PIX_FMT_NV12 : c->chrToYV12 = nv12ToUV_c; break;
    case PIX_FMT_NV21 : c->chrToYV12 = nv21ToUV_c; break;
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
    /* little-endian high-depth planar: byte-swap to native order */
    case PIX_FMT_YUV444P9LE:
    case PIX_FMT_YUV422P9LE:
    case PIX_FMT_YUV420P9LE:
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE:
    case PIX_FMT_YUV444P10LE:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
    /* big-endian high-depth planar: byte-swap to native order */
    case PIX_FMT_YUV444P9BE:
    case PIX_FMT_YUV422P9BE:
    case PIX_FMT_YUV420P9BE:
    case PIX_FMT_YUV444P10BE:
    case PIX_FMT_YUV422P10BE:
    case PIX_FMT_YUV420P10BE:
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;

    /* RGB input: pick the horizontally averaging ("half") converters when
       the chroma destination is horizontally subsampled */
    if (c->chrSrcHSubSample) {
        case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
        case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
        case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
        case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
        case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV_half_c; break;
        case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c; break;
        case PIX_FMT_BGR24 : c->chrToYV12 = bgr24ToUV_half_c; break;
        case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
        case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
        case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
        case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
        case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_half_c; break;
        case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c; break;
        case PIX_FMT_RGB24 : c->chrToYV12 = rgb24ToUV_half_c; break;
        case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
        case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
        case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
        case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
        case PIX_FMT_GBR24P : c->chrToYV12 = gbr24pToUV_half_c; break;
        /* non-subsampled chroma: full-resolution converters */
        case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
        case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
        case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
        case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
        case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV_c; break;
        case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c; break;
        case PIX_FMT_BGR24 : c->chrToYV12 = bgr24ToUV_c; break;
        case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
        case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
        case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
        case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
        case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_c; break;
        case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c; break;
        case PIX_FMT_RGB24 : c->chrToYV12 = rgb24ToUV_c; break;
        case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
        case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
        case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
        case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
        case PIX_FMT_GBR24P : c->chrToYV12 = gbr24pToUV_c; break;

    /* luma and alpha unpackers */
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUV444P9LE:
    case PIX_FMT_YUV422P9LE:
    case PIX_FMT_YUV420P9LE:
    case PIX_FMT_YUV422P10LE:
    case PIX_FMT_YUV420P10LE:
    case PIX_FMT_YUV444P10LE:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
    case PIX_FMT_YUV444P9BE:
    case PIX_FMT_YUV422P9BE:
    case PIX_FMT_YUV420P9BE:
    case PIX_FMT_YUV444P10BE:
    case PIX_FMT_YUV422P10BE:
    case PIX_FMT_YUV420P10BE:
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
    case PIX_FMT_YUYV422 :
    case PIX_FMT_Y400A : c->lumToYV12 = yuy2ToY_c; break;
    case PIX_FMT_UYVY422 : c->lumToYV12 = uyvyToY_c; break;
    case PIX_FMT_BGR24 : c->lumToYV12 = bgr24ToY_c; break;
    case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
    case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
    case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
    case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
    case PIX_FMT_RGB24 : c->lumToYV12 = rgb24ToY_c; break;
    case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
    case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
    case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
    case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
    case PIX_FMT_RGB32 : c->lumToYV12 = bgr32ToY_c; break;
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
    case PIX_FMT_BGR32 : c->lumToYV12 = rgb32ToY_c; break;
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
    case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
    case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
    case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
    case PIX_FMT_GBR24P : c->lumToYV12 = gbr24pToY_c ; break;
    switch (srcFormat) {
    case PIX_FMT_RGBA: c->alpToYV12 = rgbaToA_c; break;
    case PIX_FMT_ARGB: c->alpToYV12 = abgrToA_c; break;
    case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
    case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;

    /* horizontal scalers, chosen by source/destination bit depth */
    if (c->srcBpc == 8) {
        if (c->dstBpc <= 10) {
            c->hyScale = c->hcScale = hScale8To15_c;
            if (c->flags & SWS_FAST_BILINEAR) {
                c->hyscale_fast = hyscale_fast_c;
                c->hcscale_fast = hcscale_fast_c;
            c->hyScale = c->hcScale = hScale8To19_c;
        c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;

    /* limited<->full range converters (only for YUV outputs) */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->dstBpc <= 10) {
            c->lumConvertRange = lumRangeFromJpeg_c;
            c->chrConvertRange = chrRangeFromJpeg_c;
            c->lumConvertRange = lumRangeToJpeg_c;
            c->chrConvertRange = chrRangeToJpeg_c;
            c->lumConvertRange = lumRangeFromJpeg16_c;
            c->chrConvertRange = chrRangeFromJpeg16_c;
            c->lumConvertRange = lumRangeToJpeg16_c;
            c->chrConvertRange = chrRangeToJpeg16_c;

    /* chroma scaling can be skipped only for gray/mono formats */
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
2962 SwsFunc ff_getSwsFunc(SwsContext *c)
2964 sws_init_swScale_c(c);
2967 ff_sws_init_swScale_mmx(c);
2969 ff_sws_init_swScale_altivec(c);