git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/avassert.h"
  64 #include "libavutil/intreadwrite.h"
  65 #include "libavutil/cpu.h"
  66 #include "libavutil/avutil.h"
  67 #include "libavutil/mathematics.h"
  68 #include "libavutil/bswap.h"
  69 #include "libavutil/pixdesc.h"
  70
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 186 {  36, 68, 60, 92, 34, 66, 58, 90,},
 187 { 100,  4,124, 28, 98,  2,122, 26,},
 188 {  52, 84, 44, 76, 50, 82, 42, 74,},
 189 { 116, 20,108, 12,114, 18,106, 10,},
 190 {  32, 64, 56, 88, 38, 70, 62, 94,},
 191 {  96,  0,120, 24,102,  6,126, 30,},
 192 {  48, 80, 40, 72, 54, 86, 46, 78,},
 193 { 112, 16,104,  8,118, 22,110, 14,},
 194 };
 195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 196 {  64, 64, 64, 64, 64, 64, 64, 64 };
 197
 198 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 199 {
 200   {   0,  1,  0,  1,  0,  1,  0,  1,},
 201   {   1,  0,  1,  0,  1,  0,  1,  0,},
 202   {   0,  1,  0,  1,  0,  1,  0,  1,},
 203   {   1,  0,  1,  0,  1,  0,  1,  0,},
 204   {   0,  1,  0,  1,  0,  1,  0,  1,},
 205   {   1,  0,  1,  0,  1,  0,  1,  0,},
 206   {   0,  1,  0,  1,  0,  1,  0,  1,},
 207   {   1,  0,  1,  0,  1,  0,  1,  0,},
 208 },{
 209   {   1,  2,  1,  2,  1,  2,  1,  2,},
 210   {   3,  0,  3,  0,  3,  0,  3,  0,},
 211   {   1,  2,  1,  2,  1,  2,  1,  2,},
 212   {   3,  0,  3,  0,  3,  0,  3,  0,},
 213   {   1,  2,  1,  2,  1,  2,  1,  2,},
 214   {   3,  0,  3,  0,  3,  0,  3,  0,},
 215   {   1,  2,  1,  2,  1,  2,  1,  2,},
 216   {   3,  0,  3,  0,  3,  0,  3,  0,},
 217 },{
 218   {   2,  4,  3,  5,  2,  4,  3,  5,},
 219   {   6,  0,  7,  1,  6,  0,  7,  1,},
 220   {   3,  5,  2,  4,  3,  5,  2,  4,},
 221   {   7,  1,  6,  0,  7,  1,  6,  0,},
 222   {   2,  4,  3,  5,  2,  4,  3,  5,},
 223   {   6,  0,  7,  1,  6,  0,  7,  1,},
 224   {   3,  5,  2,  4,  3,  5,  2,  4,},
 225   {   7,  1,  6,  0,  7,  1,  6,  0,},
 226 },{
 227   {   4,  8,  7, 11,  4,  8,  7, 11,},
 228   {  12,  0, 15,  3, 12,  0, 15,  3,},
 229   {   6, 10,  5,  9,  6, 10,  5,  9,},
 230   {  14,  2, 13,  1, 14,  2, 13,  1,},
 231   {   4,  8,  7, 11,  4,  8,  7, 11,},
 232   {  12,  0, 15,  3, 12,  0, 15,  3,},
 233   {   6, 10,  5,  9,  6, 10,  5,  9,},
 234   {  14,  2, 13,  1, 14,  2, 13,  1,},
 235 },{
 236   {   9, 17, 15, 23,  8, 16, 14, 22,},
 237   {  25,  1, 31,  7, 24,  0, 30,  6,},
 238   {  13, 21, 11, 19, 12, 20, 10, 18,},
 239   {  29,  5, 27,  3, 28,  4, 26,  2,},
 240   {   8, 16, 14, 22,  9, 17, 15, 23,},
 241   {  24,  0, 30,  6, 25,  1, 31,  7,},
 242   {  12, 20, 10, 18, 13, 21, 11, 19,},
 243   {  28,  4, 26,  2, 29,  5, 27,  3,},
 244 },{
 245   {  18, 34, 30, 46, 17, 33, 29, 45,},
 246   {  50,  2, 62, 14, 49,  1, 61, 13,},
 247   {  26, 42, 22, 38, 25, 41, 21, 37,},
 248   {  58, 10, 54,  6, 57,  9, 53,  5,},
 249   {  16, 32, 28, 44, 19, 35, 31, 47,},
 250   {  48,  0, 60, 12, 51,  3, 63, 15,},
 251   {  24, 40, 20, 36, 27, 43, 23, 39,},
 252   {  56,  8, 52,  4, 59, 11, 55,  7,},
 253 },{
 254   {  18, 34, 30, 46, 17, 33, 29, 45,},
 255   {  50,  2, 62, 14, 49,  1, 61, 13,},
 256   {  26, 42, 22, 38, 25, 41, 21, 37,},
 257   {  58, 10, 54,  6, 57,  9, 53,  5,},
 258   {  16, 32, 28, 44, 19, 35, 31, 47,},
 259   {  48,  0, 60, 12, 51,  3, 63, 15,},
 260   {  24, 40, 20, 36, 27, 43, 23, 39,},
 261   {  56,  8, 52,  4, 59, 11, 55,  7,},
 262 },{
 263   {  36, 68, 60, 92, 34, 66, 58, 90,},
 264   { 100,  4,124, 28, 98,  2,122, 26,},
 265   {  52, 84, 44, 76, 50, 82, 42, 74,},
 266   { 116, 20,108, 12,114, 18,106, 10,},
 267   {  32, 64, 56, 88, 38, 70, 62, 94,},
 268   {  96,  0,120, 24,102,  6,126, 30,},
 269   {  48, 80, 40, 72, 54, 86, 46, 78,},
 270   { 112, 16,104,  8,118, 22,110, 14,},
 271 }};
 272
 273 static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 274
 275 const uint16_t dither_scale[15][16]={
 276 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 277 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
 278 {    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
 279 {    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
 280 {    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
 281 {    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
 282 {    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
 283 {    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
 284 {    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
 285 {    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
 286 {    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
 287 {    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
 288 {    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
 289 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 290 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 291 };
 292
 293 static av_always_inline void
 294 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 295                       int lumFilterSize, const int16_t *chrFilter,
 296                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 297                       int chrFilterSize, const int32_t **alpSrc,
 298                       uint16_t *dest[4], int dstW, int chrDstW,
 299                       int big_endian, int output_bits)
 300 {
 301     //FIXME Optimize (just quickly written not optimized..)
 302     int i;
 303     int dword= output_bits == 16;
 304     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 305              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 306     int shift = 11 + 4*dword + 16 - output_bits - 1;
 307
 308 #define output_pixel(pos, val) \
 309     if (big_endian) { \
 310         AV_WB16(pos, av_clip_uint16(val >> shift)); \
 311     } else { \
 312         AV_WL16(pos, av_clip_uint16(val >> shift)); \
 313     }
 314     for (i = 0; i < dstW; i++) {
 315         int val = 1 << (26-output_bits + 4*dword - 1);
 316         int j;
 317
 318         for (j = 0; j < lumFilterSize; j++)
 319             val += ((dword ? lumSrc[j][i] : ((int16_t**)lumSrc)[j][i]) * lumFilter[j])>>1;
 320
 321         output_pixel(&yDest[i], val);
 322     }
 323
 324     if (uDest) {
 325         for (i = 0; i < chrDstW; i++) {
 326             int u = 1 << (26-output_bits + 4*dword - 1);
 327             int v = 1 << (26-output_bits + 4*dword - 1);
 328             int j;
 329
 330             for (j = 0; j < chrFilterSize; j++) {
 331                 u += ((dword ? chrUSrc[j][i] : ((int16_t**)chrUSrc)[j][i]) * chrFilter[j]) >> 1;
 332                 v += ((dword ? chrVSrc[j][i] : ((int16_t**)chrVSrc)[j][i]) * chrFilter[j]) >> 1;
 333             }
 334
 335             output_pixel(&uDest[i], u);
 336             output_pixel(&vDest[i], v);
 337         }
 338     }
 339
 340     if (CONFIG_SWSCALE_ALPHA && aDest) {
 341         for (i = 0; i < dstW; i++) {
 342             int val = 1 << (26-output_bits + 4*dword - 1);
 343             int j;
 344
 345             for (j = 0; j < lumFilterSize; j++)
 346                 val += ((dword ? alpSrc[j][i] : ((int16_t**)alpSrc)[j][i]) * lumFilter[j]) >> 1;
 347
 348             output_pixel(&aDest[i], val);
 349         }
 350     }
 351 #undef output_pixel
 352 }
 353
 354 static av_always_inline void
 355 yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 356                       int lumFilterSize, const int16_t *chrFilter,
 357                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 358                       int chrFilterSize, const int16_t **alpSrc,
 359                       uint16_t *dest[4], int dstW, int chrDstW,
 360                       int big_endian, int output_bits)
 361 {
 362     //FIXME Optimize (just quickly written not optimized..)
 363     int i;
 364     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 365              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 366     int shift = 11 + 16 - output_bits - 1;
 367
 368 #define output_pixel(pos, val) \
 369     if (big_endian) { \
 370         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 371     } else { \
 372         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 373     }
 374     for (i = 0; i < dstW; i++) {
 375         int val = 1 << (26-output_bits - 1);
 376         int j;
 377
 378         for (j = 0; j < lumFilterSize; j++)
 379             val += (lumSrc[j][i] * lumFilter[j]) >> 1;
 380
 381         output_pixel(&yDest[i], val);
 382     }
 383
 384     if (uDest) {
 385         for (i = 0; i < chrDstW; i++) {
 386             int u = 1 << (26-output_bits - 1);
 387             int v = 1 << (26-output_bits - 1);
 388             int j;
 389
 390             for (j = 0; j < chrFilterSize; j++) {
 391                 u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
 392                 v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
 393             }
 394
 395             output_pixel(&uDest[i], u);
 396             output_pixel(&vDest[i], v);
 397         }
 398     }
 399
 400     if (CONFIG_SWSCALE_ALPHA && aDest) {
 401         for (i = 0; i < dstW; i++) {
 402             int val = 1 << (26-output_bits - 1);
 403             int j;
 404
 405             for (j = 0; j < lumFilterSize; j++)
 406                 val += (alpSrc[j][i] * lumFilter[j]) >> 1;
 407
 408             output_pixel(&aDest[i], val);
 409         }
 410     }
 411 #undef output_pixel
 412 }
 413
 414 #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \
 415 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 416                               const int16_t **_lumSrc, int lumFilterSize, \
 417                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 418                               const int16_t **_chrVSrc, \
 419                               int chrFilterSize, const int16_t **_alpSrc, \
 420                               uint8_t *_dest[4], int dstW, int chrDstW) \
 421 { \
 422     const typeX_t **lumSrc  = (const typeX_t **) _lumSrc, \
 423                   **chrUSrc = (const typeX_t **) _chrUSrc, \
 424                   **chrVSrc = (const typeX_t **) _chrVSrc, \
 425                   **alpSrc  = (const typeX_t **) _alpSrc; \
 426     yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \
 427                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 428                          alpSrc, (uint16_t **) _dest, \
 429                          dstW, chrDstW, is_be, bits); \
 430 }
 431 yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t);
 432 yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t);
 433 yuv2NBPS(10, BE, 1, yuv2yuvX10_c_template, int16_t);
 434 yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t);
 435 yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t);
 436 yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t);
 437
 438 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 439                        const int16_t **lumSrc, int lumFilterSize,
 440                        const int16_t *chrFilter, const int16_t **chrUSrc,
 441                        const int16_t **chrVSrc,
 442                        int chrFilterSize, const int16_t **alpSrc,
 443                        uint8_t *dest[4], int dstW, int chrDstW)
 444 {
 445     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 446             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 447     int i;
 448     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 449
 450     //FIXME Optimize (just quickly written not optimized..)
 451     for (i=0; i<dstW; i++) {
 452         int val = lumDither[i & 7] << 12;
 453         int j;
 454         for (j=0; j<lumFilterSize; j++)
 455             val += lumSrc[j][i] * lumFilter[j];
 456
 457         yDest[i]= av_clip_uint8(val>>19);
 458     }
 459
 460     if (uDest)
 461         for (i=0; i<chrDstW; i++) {
 462             int u = chrDither[i & 7] << 12;
 463             int v = chrDither[(i + 3) & 7] << 12;
 464             int j;
 465             for (j=0; j<chrFilterSize; j++) {
 466                 u += chrUSrc[j][i] * chrFilter[j];
 467                 v += chrVSrc[j][i] * chrFilter[j];
 468             }
 469
 470             uDest[i]= av_clip_uint8(u>>19);
 471             vDest[i]= av_clip_uint8(v>>19);
 472         }
 473
 474     if (CONFIG_SWSCALE_ALPHA && aDest)
 475         for (i=0; i<dstW; i++) {
 476             int val = lumDither[i & 7] << 12;
 477             int j;
 478             for (j=0; j<lumFilterSize; j++)
 479                 val += alpSrc[j][i] * lumFilter[j];
 480
 481             aDest[i]= av_clip_uint8(val>>19);
 482         }
 483 }
 484
 485 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 486                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 487                        const int16_t *alpSrc,
 488                        uint8_t *dest[4], int dstW, int chrDstW)
 489 {
 490     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 491             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 492     int i;
 493     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 494
 495     for (i=0; i<dstW; i++) {
 496         int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
 497         yDest[i]= av_clip_uint8(val);
 498     }
 499
 500     if (uDest)
 501         for (i=0; i<chrDstW; i++) {
 502             int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
 503             int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
 504             uDest[i]= av_clip_uint8(u);
 505             vDest[i]= av_clip_uint8(v);
 506         }
 507
 508     if (CONFIG_SWSCALE_ALPHA && aDest)
 509         for (i=0; i<dstW; i++) {
 510             int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
 511             aDest[i]= av_clip_uint8(val);
 512         }
 513 }
 514
 515 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 516                         const int16_t **lumSrc, int lumFilterSize,
 517                         const int16_t *chrFilter, const int16_t **chrUSrc,
 518                         const int16_t **chrVSrc, int chrFilterSize,
 519                         const int16_t **alpSrc, uint8_t *dest[4],
 520                         int dstW, int chrDstW)
 521 {
 522     uint8_t *yDest = dest[0], *uDest = dest[1];
 523     enum PixelFormat dstFormat = c->dstFormat;
 524     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 525
 526     //FIXME Optimize (just quickly written not optimized..)
 527     int i;
 528     for (i=0; i<dstW; i++) {
 529         int val = lumDither[i & 7] << 12;
 530         int j;
 531         for (j=0; j<lumFilterSize; j++)
 532             val += lumSrc[j][i] * lumFilter[j];
 533
 534         yDest[i]= av_clip_uint8(val>>19);
 535     }
 536
 537     if (!uDest)
 538         return;
 539
 540     if (dstFormat == PIX_FMT_NV12)
 541         for (i=0; i<chrDstW; i++) {
 542             int u = chrDither[i & 7] << 12;
 543             int v = chrDither[(i + 3) & 7] << 12;
 544             int j;
 545             for (j=0; j<chrFilterSize; j++) {
 546                 u += chrUSrc[j][i] * chrFilter[j];
 547                 v += chrVSrc[j][i] * chrFilter[j];
 548             }
 549
 550             uDest[2*i]= av_clip_uint8(u>>19);
 551             uDest[2*i+1]= av_clip_uint8(v>>19);
 552         }
 553     else
 554         for (i=0; i<chrDstW; i++) {
 555             int u = chrDither[i & 7] << 12;
 556             int v = chrDither[(i + 3) & 7] << 12;
 557             int j;
 558             for (j=0; j<chrFilterSize; j++) {
 559                 u += chrUSrc[j][i] * chrFilter[j];
 560                 v += chrVSrc[j][i] * chrFilter[j];
 561             }
 562
 563             uDest[2*i]= av_clip_uint8(v>>19);
 564             uDest[2*i+1]= av_clip_uint8(u>>19);
 565         }
 566 }
 567
 568 #define output_pixel(pos, val) \
 569         if (target == PIX_FMT_GRAY16BE) { \
 570             AV_WB16(pos, val); \
 571         } else { \
 572             AV_WL16(pos, val); \
 573         }
 574
 575 static av_always_inline void
 576 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 577                         const int32_t **lumSrc, int lumFilterSize,
 578                         const int16_t *chrFilter, const int32_t **chrUSrc,
 579                         const int32_t **chrVSrc, int chrFilterSize,
 580                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 581                         int y, enum PixelFormat target)
 582 {
 583     int i;
 584
 585     for (i = 0; i < (dstW >> 1); i++) {
 586         int j;
 587         int Y1 = 1 << 14;
 588         int Y2 = 1 << 14;
 589
 590         for (j = 0; j < lumFilterSize; j++) {
 591             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 592             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 593         }
 594         Y1 >>= 15;
 595         Y2 >>= 15;
 596         if ((Y1 | Y2) & 0x10000) {
 597             Y1 = av_clip_uint16(Y1);
 598             Y2 = av_clip_uint16(Y2);
 599         }
 600         output_pixel(&dest[i * 2 + 0], Y1);
 601         output_pixel(&dest[i * 2 + 1], Y2);
 602     }
 603 }
 604
 605 static av_always_inline void
 606 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 607                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 608                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 609                         int yalpha, int uvalpha, int y,
 610                         enum PixelFormat target)
 611 {
 612     int  yalpha1 = 4095 - yalpha;
 613     int i;
 614     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 615
 616     for (i = 0; i < (dstW >> 1); i++) {
 617         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 618         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 619
 620         output_pixel(&dest[i * 2 + 0], Y1);
 621         output_pixel(&dest[i * 2 + 1], Y2);
 622     }
 623 }
 624
 625 static av_always_inline void
 626 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 627                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 628                         const int32_t *abuf0, uint16_t *dest, int dstW,
 629                         int uvalpha, int y, enum PixelFormat target)
 630 {
 631     int i;
 632
 633     for (i = 0; i < (dstW >> 1); i++) {
 634         int Y1 = (buf0[i * 2    ]+4)>>3;
 635         int Y2 = (buf0[i * 2 + 1]+4)>>3;
 636
 637         output_pixel(&dest[i * 2 + 0], Y1);
 638         output_pixel(&dest[i * 2 + 1], Y2);
 639     }
 640 }
 641
 642 #undef output_pixel
 643
 644 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 645 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 646                         const int16_t **_lumSrc, int lumFilterSize, \
 647                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 648                         const int16_t **_chrVSrc, int chrFilterSize, \
 649                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 650                         int y) \
 651 { \
 652     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 653                   **chrUSrc = (const int32_t **) _chrUSrc, \
 654                   **chrVSrc = (const int32_t **) _chrVSrc, \
 655                   **alpSrc  = (const int32_t **) _alpSrc; \
 656     uint16_t *dest = (uint16_t *) _dest; \
 657     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 658                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 659                           alpSrc, dest, dstW, y, fmt); \
 660 } \
 661  \
 662 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 663                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 664                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 665                         int yalpha, int uvalpha, int y) \
 666 { \
 667     const int32_t **buf  = (const int32_t **) _buf, \
 668                   **ubuf = (const int32_t **) _ubuf, \
 669                   **vbuf = (const int32_t **) _vbuf, \
 670                   **abuf = (const int32_t **) _abuf; \
 671     uint16_t *dest = (uint16_t *) _dest; \
 672     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 673                           dest, dstW, yalpha, uvalpha, y, fmt); \
 674 } \
 675  \
 676 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 677                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 678                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 679                         int uvalpha, int y) \
 680 { \
 681     const int32_t *buf0  = (const int32_t *)  _buf0, \
 682                  **ubuf  = (const int32_t **) _ubuf, \
 683                  **vbuf  = (const int32_t **) _vbuf, \
 684                   *abuf0 = (const int32_t *)  _abuf0; \
 685     uint16_t *dest = (uint16_t *) _dest; \
 686     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 687                                   dstW, uvalpha, y, fmt); \
 688 }
 689
 690 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 691 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 692
 693 #define output_pixel(pos, acc) \
 694     if (target == PIX_FMT_MONOBLACK) { \
 695         pos = acc; \
 696     } else { \
 697         pos = ~acc; \
 698     }
 699
 700 static av_always_inline void
 701 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 702                       const int16_t **lumSrc, int lumFilterSize,
 703                       const int16_t *chrFilter, const int16_t **chrUSrc,
 704                       const int16_t **chrVSrc, int chrFilterSize,
 705                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 706                       int y, enum PixelFormat target)
 707 {
 708     const uint8_t * const d128=dither_8x8_220[y&7];
 709     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 710     int i;
 711     int acc = 0;
 712
 713     for (i = 0; i < dstW - 1; i += 2) {
 714         int j;
 715         int Y1 = 1 << 18;
 716         int Y2 = 1 << 18;
 717
 718         for (j = 0; j < lumFilterSize; j++) {
 719             Y1 += lumSrc[j][i]   * lumFilter[j];
 720             Y2 += lumSrc[j][i+1] * lumFilter[j];
 721         }
 722         Y1 >>= 19;
 723         Y2 >>= 19;
 724         if ((Y1 | Y2) & 0x100) {
 725             Y1 = av_clip_uint8(Y1);
 726             Y2 = av_clip_uint8(Y2);
 727         }
 728         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 729         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 730         if ((i & 7) == 6) {
 731             output_pixel(*dest++, acc);
 732         }
 733     }
 734 }
 735
 736 static av_always_inline void
 737 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 738                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 739                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 740                       int yalpha, int uvalpha, int y,
 741                       enum PixelFormat target)
 742 {
 743     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 744     const uint8_t * const d128 = dither_8x8_220[y & 7];
 745     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 746     int  yalpha1 = 4095 - yalpha;
 747     int i;
 748
 749     for (i = 0; i < dstW - 7; i += 8) {
 750         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 751         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 752         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 753         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 754         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 755         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 756         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 757         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 758         output_pixel(*dest++, acc);
 759     }
 760 }
 761
 762 static av_always_inline void
 763 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 764                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 765                       const int16_t *abuf0, uint8_t *dest, int dstW,
 766                       int uvalpha, int y, enum PixelFormat target)
 767 {
 768     const uint8_t * const d128 = dither_8x8_220[y & 7];
 769     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 770     int i;
 771
 772     for (i = 0; i < dstW - 7; i += 8) {
 773         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 774         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 775         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 776         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 777         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 778         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 779         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 780         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 781         output_pixel(*dest++, acc);
 782     }
 783 }
 784
 785 #undef output_pixel
 786
 787 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 788 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 789                                 const int16_t **lumSrc, int lumFilterSize, \
 790                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 791                                 const int16_t **chrVSrc, int chrFilterSize, \
 792                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 793                                 int y) \
 794 { \
 795     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 796                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 797                                   alpSrc, dest, dstW, y, fmt); \
 798 } \
 799  \
 800 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 801                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 802                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 803                                 int yalpha, int uvalpha, int y) \
 804 { \
 805     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 806                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 807 } \
 808  \
 809 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 810                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 811                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 812                                 int uvalpha, int y) \
 813 { \
 814     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 815                                   abuf0, dest, dstW, uvalpha, \
 816                                   y, fmt); \
 817 }
 818
 819 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 820 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 821
 822 #define output_pixels(pos, Y1, U, Y2, V) \
 823     if (target == PIX_FMT_YUYV422) { \
 824         dest[pos + 0] = Y1; \
 825         dest[pos + 1] = U;  \
 826         dest[pos + 2] = Y2; \
 827         dest[pos + 3] = V;  \
 828     } else { \
 829         dest[pos + 0] = U;  \
 830         dest[pos + 1] = Y1; \
 831         dest[pos + 2] = V;  \
 832         dest[pos + 3] = Y2; \
 833     }
 834
 835 static av_always_inline void
 836 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 837                      const int16_t **lumSrc, int lumFilterSize,
 838                      const int16_t *chrFilter, const int16_t **chrUSrc,
 839                      const int16_t **chrVSrc, int chrFilterSize,
 840                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 841                      int y, enum PixelFormat target)
 842 {
 843     int i;
 844
 845     for (i = 0; i < (dstW >> 1); i++) {
 846         int j;
 847         int Y1 = 1 << 18;
 848         int Y2 = 1 << 18;
 849         int U  = 1 << 18;
 850         int V  = 1 << 18;
 851
 852         for (j = 0; j < lumFilterSize; j++) {
 853             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 854             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 855         }
 856         for (j = 0; j < chrFilterSize; j++) {
 857             U += chrUSrc[j][i] * chrFilter[j];
 858             V += chrVSrc[j][i] * chrFilter[j];
 859         }
 860         Y1 >>= 19;
 861         Y2 >>= 19;
 862         U  >>= 19;
 863         V  >>= 19;
 864         if ((Y1 | Y2 | U | V) & 0x100) {
 865             Y1 = av_clip_uint8(Y1);
 866             Y2 = av_clip_uint8(Y2);
 867             U  = av_clip_uint8(U);
 868             V  = av_clip_uint8(V);
 869         }
 870         output_pixels(4*i, Y1, U, Y2, V);
 871     }
 872 }
 873
 874 static av_always_inline void
 875 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 876                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 877                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 878                      int yalpha, int uvalpha, int y,
 879                      enum PixelFormat target)
 880 {
 881     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 882                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 883                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 884     int  yalpha1 = 4095 - yalpha;
 885     int uvalpha1 = 4095 - uvalpha;
 886     int i;
 887
 888     for (i = 0; i < (dstW >> 1); i++) {
 889         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 890         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 891         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 892         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 893
 894         output_pixels(i * 4, Y1, U, Y2, V);
 895     }
 896 }
 897
 898 static av_always_inline void
 899 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 900                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 901                      const int16_t *abuf0, uint8_t *dest, int dstW,
 902                      int uvalpha, int y, enum PixelFormat target)
 903 {
 904     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 905                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 906     int i;
 907
 908     if (uvalpha < 2048) {
 909         for (i = 0; i < (dstW >> 1); i++) {
 910             int Y1 = buf0[i * 2]     >> 7;
 911             int Y2 = buf0[i * 2 + 1] >> 7;
 912             int U  = ubuf1[i]        >> 7;
 913             int V  = vbuf1[i]        >> 7;
 914
 915             output_pixels(i * 4, Y1, U, Y2, V);
 916         }
 917     } else {
 918         for (i = 0; i < (dstW >> 1); i++) {
 919             int Y1 =  buf0[i * 2]          >> 7;
 920             int Y2 =  buf0[i * 2 + 1]      >> 7;
 921             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 922             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 923
 924             output_pixels(i * 4, Y1, U, Y2, V);
 925         }
 926     }
 927 }
 928
 929 #undef output_pixels
 930
 931 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
 932 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 933
 934 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 935 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 936 #define output_pixel(pos, val) \
 937     if (isBE(target)) { \
 938         AV_WB16(pos, val); \
 939     } else { \
 940         AV_WL16(pos, val); \
 941     }
 942
 943 static av_always_inline void
 944 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 945                        const int32_t **lumSrc, int lumFilterSize,
 946                        const int16_t *chrFilter, const int32_t **chrUSrc,
 947                        const int32_t **chrVSrc, int chrFilterSize,
 948                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 949                        int y, enum PixelFormat target)
 950 {
 951     int i;
 952
 953     for (i = 0; i < (dstW >> 1); i++) {
 954         int j;
 955         int Y1 = 0;
 956         int Y2 = 0;
 957         int U  = -128 << 23; // 19
 958         int V  = -128 << 23;
 959         int R, G, B;
 960
 961         for (j = 0; j < lumFilterSize; j++) {
 962             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 963             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 964         }
 965         for (j = 0; j < chrFilterSize; j++) {
 966             U += chrUSrc[j][i] * chrFilter[j];
 967             V += chrVSrc[j][i] * chrFilter[j];
 968         }
 969
 970         // 8bit: 12+15=27; 16-bit: 12+19=31
 971         Y1 >>= 14; // 10
 972         Y2 >>= 14;
 973         U  >>= 14;
 974         V  >>= 14;
 975
 976         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 977         Y1 -= c->yuv2rgb_y_offset;
 978         Y2 -= c->yuv2rgb_y_offset;
 979         Y1 *= c->yuv2rgb_y_coeff;
 980         Y2 *= c->yuv2rgb_y_coeff;
 981         Y1 += 1 << 13; // 21
 982         Y2 += 1 << 13;
 983         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 984
 985         R = V * c->yuv2rgb_v2r_coeff;
 986         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 987         B =                            U * c->yuv2rgb_u2b_coeff;
 988
 989         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 990         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 991         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 992         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 993         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 994         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 995         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 996         dest += 6;
 997     }
 998 }
 999
1000 static av_always_inline void
1001 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
1002                        const int32_t *ubuf[2], const int32_t *vbuf[2],
1003                        const int32_t *abuf[2], uint16_t *dest, int dstW,
1004                        int yalpha, int uvalpha, int y,
1005                        enum PixelFormat target)
1006 {
1007     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
1008                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1009                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1010     int  yalpha1 = 4095 - yalpha;
1011     int uvalpha1 = 4095 - uvalpha;
1012     int i;
1013
1014     for (i = 0; i < (dstW >> 1); i++) {
1015         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
1016         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
1017         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
1018         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
1019         int R, G, B;
1020
1021         Y1 -= c->yuv2rgb_y_offset;
1022         Y2 -= c->yuv2rgb_y_offset;
1023         Y1 *= c->yuv2rgb_y_coeff;
1024         Y2 *= c->yuv2rgb_y_coeff;
1025         Y1 += 1 << 13;
1026         Y2 += 1 << 13;
1027
1028         R = V * c->yuv2rgb_v2r_coeff;
1029         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1030         B =                            U * c->yuv2rgb_u2b_coeff;
1031
1032         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1033         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1034         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1035         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1036         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1037         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1038         dest += 6;
1039     }
1040 }
1041
1042 static av_always_inline void
1043 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
1044                        const int32_t *ubuf[2], const int32_t *vbuf[2],
1045                        const int32_t *abuf0, uint16_t *dest, int dstW,
1046                        int uvalpha, int y, enum PixelFormat target)
1047 {
1048     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1049                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1050     int i;
1051
1052     if (uvalpha < 2048) {
1053         for (i = 0; i < (dstW >> 1); i++) {
1054             int Y1 = (buf0[i * 2]    ) >> 2;
1055             int Y2 = (buf0[i * 2 + 1]) >> 2;
1056             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
1057             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
1058             int R, G, B;
1059
1060             Y1 -= c->yuv2rgb_y_offset;
1061             Y2 -= c->yuv2rgb_y_offset;
1062             Y1 *= c->yuv2rgb_y_coeff;
1063             Y2 *= c->yuv2rgb_y_coeff;
1064             Y1 += 1 << 13;
1065             Y2 += 1 << 13;
1066
1067             R = V * c->yuv2rgb_v2r_coeff;
1068             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1069             B =                            U * c->yuv2rgb_u2b_coeff;
1070
1071             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1072             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1073             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1074             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1075             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1076             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1077             dest += 6;
1078         }
1079     } else {
1080         for (i = 0; i < (dstW >> 1); i++) {
1081             int Y1 = (buf0[i * 2]    ) >> 2;
1082             int Y2 = (buf0[i * 2 + 1]) >> 2;
1083             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
1084             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
1085             int R, G, B;
1086
1087             Y1 -= c->yuv2rgb_y_offset;
1088             Y2 -= c->yuv2rgb_y_offset;
1089             Y1 *= c->yuv2rgb_y_coeff;
1090             Y2 *= c->yuv2rgb_y_coeff;
1091             Y1 += 1 << 13;
1092             Y2 += 1 << 13;
1093
1094             R = V * c->yuv2rgb_v2r_coeff;
1095             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1096             B =                            U * c->yuv2rgb_u2b_coeff;
1097
1098             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1099             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1100             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1101             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1102             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1103             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1104             dest += 6;
1105         }
1106     }
1107 }
1108
1109 #undef output_pixel
1110 #undef r_b
1111 #undef b_r
1112
1113 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
1114 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
1115 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
1116 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
1117
1118 static av_always_inline void
1119 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
1120               int U, int V, int A1, int A2,
1121               const void *_r, const void *_g, const void *_b, int y,
1122               enum PixelFormat target, int hasAlpha)
1123 {
1124     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1125         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1126         uint32_t *dest = (uint32_t *) _dest;
1127         const uint32_t *r = (const uint32_t *) _r;
1128         const uint32_t *g = (const uint32_t *) _g;
1129         const uint32_t *b = (const uint32_t *) _b;
1130
1131 #if CONFIG_SMALL
1132         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1133
1134         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1135         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1136 #else
1137         if (hasAlpha) {
1138             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1139
1140             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1141             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1142         } else {
1143             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1144             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1145         }
1146 #endif
1147     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1148         uint8_t *dest = (uint8_t *) _dest;
1149         const uint8_t *r = (const uint8_t *) _r;
1150         const uint8_t *g = (const uint8_t *) _g;
1151         const uint8_t *b = (const uint8_t *) _b;
1152
1153 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1154 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1155
1156         dest[i * 6 + 0] = r_b[Y1];
1157         dest[i * 6 + 1] =   g[Y1];
1158         dest[i * 6 + 2] = b_r[Y1];
1159         dest[i * 6 + 3] = r_b[Y2];
1160         dest[i * 6 + 4] =   g[Y2];
1161         dest[i * 6 + 5] = b_r[Y2];
1162 #undef r_b
1163 #undef b_r
1164     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1165                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1166                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1167         uint16_t *dest = (uint16_t *) _dest;
1168         const uint16_t *r = (const uint16_t *) _r;
1169         const uint16_t *g = (const uint16_t *) _g;
1170         const uint16_t *b = (const uint16_t *) _b;
1171         int dr1, dg1, db1, dr2, dg2, db2;
1172
1173         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1174             dr1 = dither_2x2_8[ y & 1     ][0];
1175             dg1 = dither_2x2_4[ y & 1     ][0];
1176             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1177             dr2 = dither_2x2_8[ y & 1     ][1];
1178             dg2 = dither_2x2_4[ y & 1     ][1];
1179             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1180         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1181             dr1 = dither_2x2_8[ y & 1     ][0];
1182             dg1 = dither_2x2_8[ y & 1     ][1];
1183             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1184             dr2 = dither_2x2_8[ y & 1     ][1];
1185             dg2 = dither_2x2_8[ y & 1     ][0];
1186             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1187         } else {
1188             dr1 = dither_4x4_16[ y & 3     ][0];
1189             dg1 = dither_4x4_16[ y & 3     ][1];
1190             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1191             dr2 = dither_4x4_16[ y & 3     ][1];
1192             dg2 = dither_4x4_16[ y & 3     ][0];
1193             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1194         }
1195
1196         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1197         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1198     } else /* 8/4-bit */ {
1199         uint8_t *dest = (uint8_t *) _dest;
1200         const uint8_t *r = (const uint8_t *) _r;
1201         const uint8_t *g = (const uint8_t *) _g;
1202         const uint8_t *b = (const uint8_t *) _b;
1203         int dr1, dg1, db1, dr2, dg2, db2;
1204
1205         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1206             const uint8_t * const d64 = dither_8x8_73[y & 7];
1207             const uint8_t * const d32 = dither_8x8_32[y & 7];
1208             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1209             db1 =       d64[(i * 2 + 0) & 7];
1210             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1211             db2 =       d64[(i * 2 + 1) & 7];
1212         } else {
1213             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1214             const uint8_t * const d128 = dither_8x8_220[y & 7];
1215             dr1 = db1 = d128[(i * 2 + 0) & 7];
1216             dg1 =        d64[(i * 2 + 0) & 7];
1217             dr2 = db2 = d128[(i * 2 + 1) & 7];
1218             dg2 =        d64[(i * 2 + 1) & 7];
1219         }
1220
1221         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1222             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1223                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1224         } else {
1225             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1226             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1227         }
1228     }
1229 }
1230
1231 static av_always_inline void
1232 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1233                      const int16_t **lumSrc, int lumFilterSize,
1234                      const int16_t *chrFilter, const int16_t **chrUSrc,
1235                      const int16_t **chrVSrc, int chrFilterSize,
1236                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1237                      int y, enum PixelFormat target, int hasAlpha)
1238 {
1239     int i;
1240
1241     for (i = 0; i < (dstW >> 1); i++) {
1242         int j;
1243         int Y1 = 1 << 18;
1244         int Y2 = 1 << 18;
1245         int U  = 1 << 18;
1246         int V  = 1 << 18;
1247         int av_unused A1, A2;
1248         const void *r, *g, *b;
1249
1250         for (j = 0; j < lumFilterSize; j++) {
1251             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1252             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1253         }
1254         for (j = 0; j < chrFilterSize; j++) {
1255             U += chrUSrc[j][i] * chrFilter[j];
1256             V += chrVSrc[j][i] * chrFilter[j];
1257         }
1258         Y1 >>= 19;
1259         Y2 >>= 19;
1260         U  >>= 19;
1261         V  >>= 19;
1262         if ((Y1 | Y2 | U | V) & 0x100) {
1263             Y1 = av_clip_uint8(Y1);
1264             Y2 = av_clip_uint8(Y2);
1265             U  = av_clip_uint8(U);
1266             V  = av_clip_uint8(V);
1267         }
1268         if (hasAlpha) {
1269             A1 = 1 << 18;
1270             A2 = 1 << 18;
1271             for (j = 0; j < lumFilterSize; j++) {
1272                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1273                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1274             }
1275             A1 >>= 19;
1276             A2 >>= 19;
1277             if ((A1 | A2) & 0x100) {
1278                 A1 = av_clip_uint8(A1);
1279                 A2 = av_clip_uint8(A2);
1280             }
1281         }
1282
1283         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1284         r =  c->table_rV[V];
1285         g = (c->table_gU[U] + c->table_gV[V]);
1286         b =  c->table_bU[U];
1287
1288         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1289                       r, g, b, y, target, hasAlpha);
1290     }
1291 }
1292
1293 static av_always_inline void
1294 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1295                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1296                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1297                      int yalpha, int uvalpha, int y,
1298                      enum PixelFormat target, int hasAlpha)
1299 {
1300     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1301                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1302                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1303                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1304                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1305     int  yalpha1 = 4095 - yalpha;
1306     int uvalpha1 = 4095 - uvalpha;
1307     int i;
1308
1309     for (i = 0; i < (dstW >> 1); i++) {
1310         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1311         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1312         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1313         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1314         int A1, A2;
1315         const void *r =  c->table_rV[V],
1316                    *g = (c->table_gU[U] + c->table_gV[V]),
1317                    *b =  c->table_bU[U];
1318
1319         if (hasAlpha) {
1320             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1321             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1322         }
1323
1324         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1325                       r, g, b, y, target, hasAlpha);
1326     }
1327 }
1328
1329 static av_always_inline void
1330 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1331                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1332                      const int16_t *abuf0, uint8_t *dest, int dstW,
1333                      int uvalpha, int y, enum PixelFormat target,
1334                      int hasAlpha)
1335 {
1336     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1337                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1338     int i;
1339
1340     if (uvalpha < 2048) {
1341         for (i = 0; i < (dstW >> 1); i++) {
1342             int Y1 = buf0[i * 2]     >> 7;
1343             int Y2 = buf0[i * 2 + 1] >> 7;
1344             int U  = ubuf1[i]        >> 7;
1345             int V  = vbuf1[i]        >> 7;
1346             int A1, A2;
1347             const void *r =  c->table_rV[V],
1348                        *g = (c->table_gU[U] + c->table_gV[V]),
1349                        *b =  c->table_bU[U];
1350
1351             if (hasAlpha) {
1352                 A1 = abuf0[i * 2    ] >> 7;
1353                 A2 = abuf0[i * 2 + 1] >> 7;
1354             }
1355
1356             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1357                           r, g, b, y, target, hasAlpha);
1358         }
1359     } else {
1360         for (i = 0; i < (dstW >> 1); i++) {
1361             int Y1 =  buf0[i * 2]          >> 7;
1362             int Y2 =  buf0[i * 2 + 1]      >> 7;
1363             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1364             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1365             int A1, A2;
1366             const void *r =  c->table_rV[V],
1367                        *g = (c->table_gU[U] + c->table_gV[V]),
1368                        *b =  c->table_bU[U];
1369
1370             if (hasAlpha) {
1371                 A1 = abuf0[i * 2    ] >> 7;
1372                 A2 = abuf0[i * 2 + 1] >> 7;
1373             }
1374
1375             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1376                           r, g, b, y, target, hasAlpha);
1377         }
1378     }
1379 }
1380
1381 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1382 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1383                                 const int16_t **lumSrc, int lumFilterSize, \
1384                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1385                                 const int16_t **chrVSrc, int chrFilterSize, \
1386                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1387                                 int y) \
1388 { \
1389     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1390                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1391                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1392 }
1393 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1394 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1395 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1396                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1397                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1398                                 int yalpha, int uvalpha, int y) \
1399 { \
1400     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1401                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1402 } \
1403  \
1404 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1405                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1406                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1407                                 int uvalpha, int y) \
1408 { \
1409     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1410                                   dstW, uvalpha, y, fmt, hasAlpha); \
1411 }
1412
1413 #if CONFIG_SMALL
1414 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1415 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1416 #else
1417 #if CONFIG_SWSCALE_ALPHA
1418 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1419 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1420 #endif
1421 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1422 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1423 #endif
1424 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1425 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1426 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1427 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1428 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1429 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1430 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1431 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1432
1433 static av_always_inline void
1434 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1435                           const int16_t **lumSrc, int lumFilterSize,
1436                           const int16_t *chrFilter, const int16_t **chrUSrc,
1437                           const int16_t **chrVSrc, int chrFilterSize,
1438                           const int16_t **alpSrc, uint8_t *dest,
1439                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1440 {
1441     int i;
1442     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1443
1444     for (i = 0; i < dstW; i++) {
1445         int j;
1446         int Y = 1<<9;
1447         int U = (1<<9)-(128 << 19);
1448         int V = (1<<9)-(128 << 19);
1449         int av_unused A;
1450         int R, G, B;
1451
1452         for (j = 0; j < lumFilterSize; j++) {
1453             Y += lumSrc[j][i] * lumFilter[j];
1454         }
1455         for (j = 0; j < chrFilterSize; j++) {
1456             U += chrUSrc[j][i] * chrFilter[j];
1457             V += chrVSrc[j][i] * chrFilter[j];
1458         }
1459         Y >>= 10;
1460         U >>= 10;
1461         V >>= 10;
1462         if (hasAlpha) {
1463             A = 1 << 18;
1464             for (j = 0; j < lumFilterSize; j++) {
1465                 A += alpSrc[j][i] * lumFilter[j];
1466             }
1467             A >>= 19;
1468             if (A & 0x100)
1469                 A = av_clip_uint8(A);
1470         }
1471         Y -= c->yuv2rgb_y_offset;
1472         Y *= c->yuv2rgb_y_coeff;
1473         Y += 1 << 21;
1474         R = Y + V*c->yuv2rgb_v2r_coeff;
1475         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1476         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1477         if ((R | G | B) & 0xC0000000) {
1478             R = av_clip_uintp2(R, 30);
1479             G = av_clip_uintp2(G, 30);
1480             B = av_clip_uintp2(B, 30);
1481         }
1482
1483         switch(target) {
1484         case PIX_FMT_ARGB:
1485             dest[0] = hasAlpha ? A : 255;
1486             dest[1] = R >> 22;
1487             dest[2] = G >> 22;
1488             dest[3] = B >> 22;
1489             break;
1490         case PIX_FMT_RGB24:
1491             dest[0] = R >> 22;
1492             dest[1] = G >> 22;
1493             dest[2] = B >> 22;
1494             break;
1495         case PIX_FMT_RGBA:
1496             dest[0] = R >> 22;
1497             dest[1] = G >> 22;
1498             dest[2] = B >> 22;
1499             dest[3] = hasAlpha ? A : 255;
1500             break;
1501         case PIX_FMT_ABGR:
1502             dest[0] = hasAlpha ? A : 255;
1503             dest[1] = B >> 22;
1504             dest[2] = G >> 22;
1505             dest[3] = R >> 22;
1506             break;
1507         case PIX_FMT_BGR24:
1508             dest[0] = B >> 22;
1509             dest[1] = G >> 22;
1510             dest[2] = R >> 22;
1511             break;
1512         case PIX_FMT_BGRA:
1513             dest[0] = B >> 22;
1514             dest[1] = G >> 22;
1515             dest[2] = R >> 22;
1516             dest[3] = hasAlpha ? A : 255;
1517             break;
1518         }
1519         dest += step;
1520     }
1521 }
1522
1523 #if CONFIG_SMALL
1524 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1525 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1526 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1527 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1528 #else
1529 #if CONFIG_SWSCALE_ALPHA
1530 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1531 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1532 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1533 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1534 #endif
1535 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1536 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1537 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1538 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1539 #endif
1540 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1541 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1542
1543 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1544                                        int width, int height,
1545                                        int y, uint8_t val)
1546 {
1547     int i;
1548     uint8_t *ptr = plane + stride*y;
1549     for (i=0; i<height; i++) {
1550         memset(ptr, val, width);
1551         ptr += stride;
1552     }
1553 }
1554
1555 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1556
1557 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1558 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1559
1560 static av_always_inline void
1561 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1562                     enum PixelFormat origin)
1563 {
1564     int i;
1565     for (i = 0; i < width; i++) {
1566         unsigned int r_b = input_pixel(&src[i*3+0]);
1567         unsigned int   g = input_pixel(&src[i*3+1]);
1568         unsigned int b_r = input_pixel(&src[i*3+2]);
1569
1570         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1571     }
1572 }
1573
1574 static av_always_inline void
1575 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1576                     const uint16_t *src1, const uint16_t *src2,
1577                     int width, enum PixelFormat origin)
1578 {
1579     int i;
1580     assert(src1==src2);
1581     for (i = 0; i < width; i++) {
1582         int r_b = input_pixel(&src1[i*3+0]);
1583         int   g = input_pixel(&src1[i*3+1]);
1584         int b_r = input_pixel(&src1[i*3+2]);
1585
1586         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1587         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1588     }
1589 }
1590
1591 static av_always_inline void
1592 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1593                           const uint16_t *src1, const uint16_t *src2,
1594                           int width, enum PixelFormat origin)
1595 {
1596     int i;
1597     assert(src1==src2);
1598     for (i = 0; i < width; i++) {
1599         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1600         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1601         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1602
1603         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1604         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1605     }
1606 }
1607
1608 #undef r
1609 #undef b
1610 #undef input_pixel
1611
1612 #define rgb48funcs(pattern, BE_LE, origin) \
1613 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
1614                                     int width, uint32_t *unused) \
1615 { \
1616     const uint16_t *src = (const uint16_t *) _src; \
1617     uint16_t *dst = (uint16_t *) _dst; \
1618     rgb48ToY_c_template(dst, src, width, origin); \
1619 } \
1620  \
1621 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1622                                     const uint8_t *_src1, const uint8_t *_src2, \
1623                                     int width, uint32_t *unused) \
1624 { \
1625     const uint16_t *src1 = (const uint16_t *) _src1, \
1626                    *src2 = (const uint16_t *) _src2; \
1627     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1628     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1629 } \
1630  \
1631 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1632                                     const uint8_t *_src1, const uint8_t *_src2, \
1633                                     int width, uint32_t *unused) \
1634 { \
1635     const uint16_t *src1 = (const uint16_t *) _src1, \
1636                    *src2 = (const uint16_t *) _src2; \
1637     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1638     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1639 }
1640
1641 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1642 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1643 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1644 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1645
1646 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1647                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1648                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1649
1650 static av_always_inline void
1651 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1652                        int width, enum PixelFormat origin,
1653                        int shr,   int shg,   int shb, int shp,
1654                        int maskr, int maskg, int maskb,
1655                        int rsh,   int gsh,   int bsh, int S)
1656 {
1657     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1658               rnd = (32<<((S)-1)) + (1<<(S-7));
1659     int i;
1660
1661     for (i = 0; i < width; i++) {
1662         int px = input_pixel(i) >> shp;
1663         int b = (px & maskb) >> shb;
1664         int g = (px & maskg) >> shg;
1665         int r = (px & maskr) >> shr;
1666
1667         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1668     }
1669 }
1670
1671 static av_always_inline void
1672 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1673                         const uint8_t *src, int width,
1674                         enum PixelFormat origin,
1675                         int shr,   int shg,   int shb, int shp,
1676                         int maskr, int maskg, int maskb,
1677                         int rsh,   int gsh,   int bsh, int S)
1678 {
1679     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1680               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1681               rnd = (256<<((S)-1)) + (1<<(S-7));
1682     int i;
1683
1684     for (i = 0; i < width; i++) {
1685         int px = input_pixel(i) >> shp;
1686         int b = (px & maskb) >> shb;
1687         int g = (px & maskg) >> shg;
1688         int r = (px & maskr) >> shr;
1689
1690         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1691         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1692     }
1693 }
1694
1695 static av_always_inline void
1696 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1697                              const uint8_t *src, int width,
1698                              enum PixelFormat origin,
1699                              int shr,   int shg,   int shb, int shp,
1700                              int maskr, int maskg, int maskb,
1701                              int rsh,   int gsh,   int bsh, int S)
1702 {
1703     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1704               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1705               rnd = (256U<<(S)) + (1<<(S-6)), maskgx = ~(maskr | maskb);
1706     int i;
1707
1708     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1709     for (i = 0; i < width; i++) {
1710         int px0 = input_pixel(2 * i + 0) >> shp;
1711         int px1 = input_pixel(2 * i + 1) >> shp;
1712         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1713         int rb = px0 + px1 - g;
1714
1715         b = (rb & maskb) >> shb;
1716         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1717             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1718             g >>= shg;
1719         } else {
1720             g = (g  & maskg) >> shg;
1721         }
1722         r = (rb & maskr) >> shr;
1723
1724         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1725         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1726     }
1727 }
1728
1729 #undef input_pixel
1730
1731 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1732                          maskg, maskb, rsh, gsh, bsh, S) \
1733 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1734                           int width, uint32_t *unused) \
1735 { \
1736     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1737                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1738 } \
1739  \
1740 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1741                            const uint8_t *src, const uint8_t *dummy, \
1742                            int width, uint32_t *unused) \
1743 { \
1744     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1745                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1746 } \
1747  \
1748 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1749                                 const uint8_t *src, const uint8_t *dummy, \
1750                                 int width, uint32_t *unused) \
1751 { \
1752     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1753                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1754 }
1755
1756 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1757 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1758 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1759 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1760 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1761 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1762 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1763 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1764 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1765 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1766 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1767 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1768
1769 static void abgrToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1770 {
1771     int i;
1772     for (i=0; i<width; i++) {
1773         dst[i]= src[4*i]<<6;
1774     }
1775 }
1776
1777 static void rgbaToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1778 {
1779     int i;
1780     for (i=0; i<width; i++) {
1781         dst[i]= src[4*i+3]<<6;
1782     }
1783 }
1784
1785 static void palToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *pal)
1786 {
1787     int i;
1788     for (i=0; i<width; i++) {
1789         int d= src[i];
1790
1791         dst[i]= (pal[d] >> 24)<<6;
1792     }
1793 }
1794
1795 static void palToY_c(int16_t *dst, const uint8_t *src, long width, uint32_t *pal)
1796 {
1797     int i;
1798     for (i=0; i<width; i++) {
1799         int d= src[i];
1800
1801         dst[i]= (pal[d] & 0xFF)<<6;
1802     }
1803 }
1804
1805 static void palToUV_c(uint16_t *dstU, int16_t *dstV,
1806                            const uint8_t *src1, const uint8_t *src2,
1807                            int width, uint32_t *pal)
1808 {
1809     int i;
1810     assert(src1 == src2);
1811     for (i=0; i<width; i++) {
1812         int p= pal[src1[i]];
1813
1814         dstU[i]= (uint8_t)(p>> 8)<<6;
1815         dstV[i]= (uint8_t)(p>>16)<<6;
1816     }
1817 }
1818
1819 static void monowhite2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1820 {
1821     int i, j;
1822     for (i=0; i<width/8; i++) {
1823         int d= ~src[i];
1824         for(j=0; j<8; j++)
1825             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1826     }
1827     if(width&7){
1828         int d= ~src[i];
1829         for(j=0; j<(width&7); j++)
1830             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1831     }
1832 }
1833
1834 static void monoblack2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1835 {
1836     int i, j;
1837     for (i=0; i<width/8; i++) {
1838         int d= src[i];
1839         for(j=0; j<8; j++)
1840             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1841     }
1842     if(width&7){
1843         int d= src[i];
1844         for(j=0; j<(width&7); j++)
1845             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1846     }
1847 }
1848
1849 //FIXME yuy2* can read up to 7 samples too much
1850
1851 static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
1852                       uint32_t *unused)
1853 {
1854     int i;
1855     for (i=0; i<width; i++)
1856         dst[i]= src[2*i];
1857 }
1858
1859 static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1860                        const uint8_t *src2, int width, uint32_t *unused)
1861 {
1862     int i;
1863     for (i=0; i<width; i++) {
1864         dstU[i]= src1[4*i + 1];
1865         dstV[i]= src1[4*i + 3];
1866     }
1867     assert(src1 == src2);
1868 }
1869
1870 static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
1871 {
1872     int i;
1873     const uint16_t *src = (const uint16_t *) _src;
1874     uint16_t *dst = (uint16_t *) _dst;
1875     for (i=0; i<width; i++) {
1876         dst[i] = av_bswap16(src[i]);
1877     }
1878 }
1879
1880 static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
1881                         const uint8_t *_src2, int width, uint32_t *unused)
1882 {
1883     int i;
1884     const uint16_t *src1 = (const uint16_t *) _src1,
1885                    *src2 = (const uint16_t *) _src2;
1886     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV;
1887     for (i=0; i<width; i++) {
1888         dstU[i] = av_bswap16(src1[i]);
1889         dstV[i] = av_bswap16(src2[i]);
1890     }
1891 }
1892
1893 /* This is almost identical to the previous, end exists only because
1894  * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1895 static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
1896                       uint32_t *unused)
1897 {
1898     int i;
1899     for (i=0; i<width; i++)
1900         dst[i]= src[2*i+1];
1901 }
1902
1903 static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1904                        const uint8_t *src2, int width, uint32_t *unused)
1905 {
1906     int i;
1907     for (i=0; i<width; i++) {
1908         dstU[i]= src1[4*i + 0];
1909         dstV[i]= src1[4*i + 2];
1910     }
1911     assert(src1 == src2);
1912 }
1913
1914 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1915                                         const uint8_t *src, int width)
1916 {
1917     int i;
1918     for (i = 0; i < width; i++) {
1919         dst1[i] = src[2*i+0];
1920         dst2[i] = src[2*i+1];
1921     }
1922 }
1923
1924 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1925                        const uint8_t *src1, const uint8_t *src2,
1926                        int width, uint32_t *unused)
1927 {
1928     nvXXtoUV_c(dstU, dstV, src1, width);
1929 }
1930
1931 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1932                        const uint8_t *src1, const uint8_t *src2,
1933                        int width, uint32_t *unused)
1934 {
1935     nvXXtoUV_c(dstV, dstU, src1, width);
1936 }
1937
1938 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1939
1940 static void bgr24ToY_c(int16_t *dst, const uint8_t *src,
1941                        int width, uint32_t *unused)
1942 {
1943     int i;
1944     for (i=0; i<width; i++) {
1945         int b= src[i*3+0];
1946         int g= src[i*3+1];
1947         int r= src[i*3+2];
1948
1949         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1950     }
1951 }
1952
1953 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1954                         const uint8_t *src2, int width, uint32_t *unused)
1955 {
1956     int i;
1957     for (i=0; i<width; i++) {
1958         int b= src1[3*i + 0];
1959         int g= src1[3*i + 1];
1960         int r= src1[3*i + 2];
1961
1962         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1963         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1964     }
1965     assert(src1 == src2);
1966 }
1967
1968 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1969                              const uint8_t *src2, int width, uint32_t *unused)
1970 {
1971     int i;
1972     for (i=0; i<width; i++) {
1973         int b= src1[6*i + 0] + src1[6*i + 3];
1974         int g= src1[6*i + 1] + src1[6*i + 4];
1975         int r= src1[6*i + 2] + src1[6*i + 5];
1976
1977         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1978         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1979     }
1980     assert(src1 == src2);
1981 }
1982
1983 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, int width,
1984                        uint32_t *unused)
1985 {
1986     int i;
1987     for (i=0; i<width; i++) {
1988         int r= src[i*3+0];
1989         int g= src[i*3+1];
1990         int b= src[i*3+2];
1991
1992         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1993     }
1994 }
1995
1996 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1997                         const uint8_t *src2, int width, uint32_t *unused)
1998 {
1999     int i;
2000     assert(src1==src2);
2001     for (i=0; i<width; i++) {
2002         int r= src1[3*i + 0];
2003         int g= src1[3*i + 1];
2004         int b= src1[3*i + 2];
2005
2006         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
2007         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
2008     }
2009 }
2010
2011 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
2012                                     const uint8_t *src2, int width, uint32_t *unused)
2013 {
2014     int i;
2015     assert(src1==src2);
2016     for (i=0; i<width; i++) {
2017         int r= src1[6*i + 0] + src1[6*i + 3];
2018         int g= src1[6*i + 1] + src1[6*i + 4];
2019         int b= src1[6*i + 2] + src1[6*i + 5];
2020
2021         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
2022         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
2023     }
2024 }
2025
2026 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
2027                            const int16_t *filter,
2028                            const int16_t *filterPos, int filterSize)
2029 {
2030     int i;
2031     int32_t *dst = (int32_t *) _dst;
2032     const uint16_t *src = (const uint16_t *) _src;
2033     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2034     int sh = bits - 4;
2035
2036     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2037         sh= 9;
2038
2039     for (i = 0; i < dstW; i++) {
2040         int j;
2041         int srcPos = filterPos[i];
2042         int val = 0;
2043
2044         for (j = 0; j < filterSize; j++) {
2045             val += src[srcPos + j] * filter[filterSize * i + j];
2046         }
2047         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
2048         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
2049     }
2050 }
2051
2052 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
2053                            const int16_t *filter,
2054                            const int16_t *filterPos, int filterSize)
2055 {
2056     int i;
2057     const uint16_t *src = (const uint16_t *) _src;
2058     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2059
2060     if(sh<15)
2061         sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2062
2063     for (i = 0; i < dstW; i++) {
2064         int j;
2065         int srcPos = filterPos[i];
2066         int val = 0;
2067
2068         for (j = 0; j < filterSize; j++) {
2069             val += src[srcPos + j] * filter[filterSize * i + j];
2070         }
2071         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2072         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2073     }
2074 }
2075
2076 // bilinear / bicubic scaling
2077 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2078                           const int16_t *filter, const int16_t *filterPos,
2079                           int filterSize)
2080 {
2081     int i;
2082     for (i=0; i<dstW; i++) {
2083         int j;
2084         int srcPos= filterPos[i];
2085         int val=0;
2086         for (j=0; j<filterSize; j++) {
2087             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2088         }
2089         //filter += hFilterSize;
2090         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2091         //dst[i] = val>>7;
2092     }
2093 }
2094
2095 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2096                           const int16_t *filter, const int16_t *filterPos,
2097                           int filterSize)
2098 {
2099     int i;
2100     int32_t *dst = (int32_t *) _dst;
2101     for (i=0; i<dstW; i++) {
2102         int j;
2103         int srcPos= filterPos[i];
2104         int val=0;
2105         for (j=0; j<filterSize; j++) {
2106             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2107         }
2108         //filter += hFilterSize;
2109         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2110         //dst[i] = val>>7;
2111     }
2112 }
2113
2114 //FIXME all pal and rgb srcFormats could do this convertion as well
2115 //FIXME all scalers more complex than bilinear could do half of this transform
2116 static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
2117 {
2118     int i;
2119     for (i = 0; i < width; i++) {
2120         dstU[i] = (FFMIN(dstU[i],30775)*4663 - 9289992)>>12; //-264
2121         dstV[i] = (FFMIN(dstV[i],30775)*4663 - 9289992)>>12; //-264
2122     }
2123 }
2124 static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
2125 {
2126     int i;
2127     for (i = 0; i < width; i++) {
2128         dstU[i] = (dstU[i]*1799 + 4081085)>>11; //1469
2129         dstV[i] = (dstV[i]*1799 + 4081085)>>11; //1469
2130     }
2131 }
2132 static void lumRangeToJpeg_c(int16_t *dst, int width)
2133 {
2134     int i;
2135     for (i = 0; i < width; i++)
2136         dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2137 }
2138 static void lumRangeFromJpeg_c(int16_t *dst, int width)
2139 {
2140     int i;
2141     for (i = 0; i < width; i++)
2142         dst[i] = (dst[i]*14071 + 33561947)>>14;
2143 }
2144
2145 static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
2146 {
2147     int i;
2148     int32_t *dstU = (int32_t *) _dstU;
2149     int32_t *dstV = (int32_t *) _dstV;
2150     for (i = 0; i < width; i++) {
2151         dstU[i] = (FFMIN(dstU[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
2152         dstV[i] = (FFMIN(dstV[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
2153     }
2154 }
2155 static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
2156 {
2157     int i;
2158     int32_t *dstU = (int32_t *) _dstU;
2159     int32_t *dstV = (int32_t *) _dstV;
2160     for (i = 0; i < width; i++) {
2161         dstU[i] = (dstU[i]*1799 + (4081085<<4))>>11; //1469
2162         dstV[i] = (dstV[i]*1799 + (4081085<<4))>>11; //1469
2163     }
2164 }
2165 static void lumRangeToJpeg16_c(int16_t *_dst, int width)
2166 {
2167     int i;
2168     int32_t *dst = (int32_t *) _dst;
2169     for (i = 0; i < width; i++)
2170         dst[i] = (FFMIN(dst[i],30189<<4)*4769 - (39057361<<2))>>12;
2171 }
2172 static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
2173 {
2174     int i;
2175     int32_t *dst = (int32_t *) _dst;
2176     for (i = 0; i < width; i++)
2177         dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
2178 }
2179
2180 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2181                            const uint8_t *src, int srcW, int xInc)
2182 {
2183     int i;
2184     unsigned int xpos=0;
2185     for (i=0;i<dstWidth;i++) {
2186         register unsigned int xx=xpos>>16;
2187         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2188         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2189         xpos+=xInc;
2190     }
2191     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2192         dst[i] = src[srcW-1]*128;
2193 }
2194
2195 // *** horizontal scale Y line to temp buffer
2196 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2197                                      const uint8_t *src, int srcW, int xInc,
2198                                      const int16_t *hLumFilter,
2199                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2200                                      uint8_t *formatConvBuffer,
2201                                      uint32_t *pal, int isAlpha)
2202 {
2203     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2204     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2205
2206     if (toYV12) {
2207         toYV12(formatConvBuffer, src, srcW, pal);
2208         src= formatConvBuffer;
2209     }
2210
2211     if (c->hScale16) {
2212         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2213         c->hScale16(dst, dstWidth, (const uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, shift);
2214     } else if (!c->hyscale_fast) {
2215         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2216     } else { // fast bilinear upscale / crap downscale
2217         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2218     }
2219
2220     if (convertRange)
2221         convertRange(dst, dstWidth);
2222 }
2223
2224 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2225                            int dstWidth, const uint8_t *src1,
2226                            const uint8_t *src2, int srcW, int xInc)
2227 {
2228     int i;
2229     unsigned int xpos=0;
2230     for (i=0;i<dstWidth;i++) {
2231         register unsigned int xx=xpos>>16;
2232         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2233         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2234         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2235         xpos+=xInc;
2236     }
2237     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2238         dst1[i] = src1[srcW-1]*128;
2239         dst2[i] = src2[srcW-1]*128;
2240     }
2241 }
2242
2243 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2244                                      const uint8_t *src1, const uint8_t *src2,
2245                                      int srcW, int xInc, const int16_t *hChrFilter,
2246                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2247                                      uint8_t *formatConvBuffer, uint32_t *pal)
2248 {
2249     if (c->chrToYV12) {
2250         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2251         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2252         src1= formatConvBuffer;
2253         src2= buf2;
2254     }
2255
2256     if (c->hScale16) {
2257         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2258         c->hScale16(dst1, dstWidth, (const uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2259         c->hScale16(dst2, dstWidth, (const uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2260     } else if (!c->hcscale_fast) {
2261         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2262         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2263     } else { // fast bilinear upscale / crap downscale
2264         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2265     }
2266
2267     if (c->chrConvertRange)
2268         c->chrConvertRange(dst1, dst2, dstWidth);
2269 }
2270
2271 static av_always_inline void
2272 find_c_packed_planar_out_funcs(SwsContext *c,
2273                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
2274                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2275                                yuv2packedX_fn *yuv2packedX)
2276 {
2277     enum PixelFormat dstFormat = c->dstFormat;
2278
2279     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2280         *yuv2yuvX     = yuv2nv12X_c;
2281     } else if (is16BPS(dstFormat)) {
2282         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
2283     } else if (is9_OR_10BPS(dstFormat)) {
2284         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2285             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
2286         } else {
2287             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
2288         }
2289     } else {
2290         *yuv2yuv1     = yuv2yuv1_c;
2291         *yuv2yuvX     = yuv2yuvX_c;
2292     }
2293     if(c->flags & SWS_FULL_CHR_H_INT) {
2294         switch (dstFormat) {
2295             case PIX_FMT_RGBA:
2296 #if CONFIG_SMALL
2297                 *yuv2packedX = yuv2rgba32_full_X_c;
2298 #else
2299 #if CONFIG_SWSCALE_ALPHA
2300                 if (c->alpPixBuf) {
2301                     *yuv2packedX = yuv2rgba32_full_X_c;
2302                 } else
2303 #endif /* CONFIG_SWSCALE_ALPHA */
2304                 {
2305                     *yuv2packedX = yuv2rgbx32_full_X_c;
2306                 }
2307 #endif /* !CONFIG_SMALL */
2308                 break;
2309             case PIX_FMT_ARGB:
2310 #if CONFIG_SMALL
2311                 *yuv2packedX = yuv2argb32_full_X_c;
2312 #else
2313 #if CONFIG_SWSCALE_ALPHA
2314                 if (c->alpPixBuf) {
2315                     *yuv2packedX = yuv2argb32_full_X_c;
2316                 } else
2317 #endif /* CONFIG_SWSCALE_ALPHA */
2318                 {
2319                     *yuv2packedX = yuv2xrgb32_full_X_c;
2320                 }
2321 #endif /* !CONFIG_SMALL */
2322                 break;
2323             case PIX_FMT_BGRA:
2324 #if CONFIG_SMALL
2325                 *yuv2packedX = yuv2bgra32_full_X_c;
2326 #else
2327 #if CONFIG_SWSCALE_ALPHA
2328                 if (c->alpPixBuf) {
2329                     *yuv2packedX = yuv2bgra32_full_X_c;
2330                 } else
2331 #endif /* CONFIG_SWSCALE_ALPHA */
2332                 {
2333                     *yuv2packedX = yuv2bgrx32_full_X_c;
2334                 }
2335 #endif /* !CONFIG_SMALL */
2336                 break;
2337             case PIX_FMT_ABGR:
2338 #if CONFIG_SMALL
2339                 *yuv2packedX = yuv2abgr32_full_X_c;
2340 #else
2341 #if CONFIG_SWSCALE_ALPHA
2342                 if (c->alpPixBuf) {
2343                     *yuv2packedX = yuv2abgr32_full_X_c;
2344                 } else
2345 #endif /* CONFIG_SWSCALE_ALPHA */
2346                 {
2347                     *yuv2packedX = yuv2xbgr32_full_X_c;
2348                 }
2349 #endif /* !CONFIG_SMALL */
2350                 break;
2351             case PIX_FMT_RGB24:
2352             *yuv2packedX = yuv2rgb24_full_X_c;
2353             break;
2354         case PIX_FMT_BGR24:
2355             *yuv2packedX = yuv2bgr24_full_X_c;
2356             break;
2357         }
2358         if(!*yuv2packedX)
2359             goto YUV_PACKED;
2360     } else {
2361         YUV_PACKED:
2362         switch (dstFormat) {
2363         case PIX_FMT_GRAY16BE:
2364             *yuv2packed1 = yuv2gray16BE_1_c;
2365             *yuv2packed2 = yuv2gray16BE_2_c;
2366             *yuv2packedX = yuv2gray16BE_X_c;
2367             break;
2368         case PIX_FMT_GRAY16LE:
2369             *yuv2packed1 = yuv2gray16LE_1_c;
2370             *yuv2packed2 = yuv2gray16LE_2_c;
2371             *yuv2packedX = yuv2gray16LE_X_c;
2372             break;
2373         case PIX_FMT_MONOWHITE:
2374             *yuv2packed1 = yuv2monowhite_1_c;
2375             *yuv2packed2 = yuv2monowhite_2_c;
2376             *yuv2packedX = yuv2monowhite_X_c;
2377             break;
2378         case PIX_FMT_MONOBLACK:
2379             *yuv2packed1 = yuv2monoblack_1_c;
2380             *yuv2packed2 = yuv2monoblack_2_c;
2381             *yuv2packedX = yuv2monoblack_X_c;
2382             break;
2383         case PIX_FMT_YUYV422:
2384             *yuv2packed1 = yuv2yuyv422_1_c;
2385             *yuv2packed2 = yuv2yuyv422_2_c;
2386             *yuv2packedX = yuv2yuyv422_X_c;
2387             break;
2388         case PIX_FMT_UYVY422:
2389             *yuv2packed1 = yuv2uyvy422_1_c;
2390             *yuv2packed2 = yuv2uyvy422_2_c;
2391             *yuv2packedX = yuv2uyvy422_X_c;
2392             break;
2393         case PIX_FMT_RGB48LE:
2394             *yuv2packed1 = yuv2rgb48le_1_c;
2395             *yuv2packed2 = yuv2rgb48le_2_c;
2396             *yuv2packedX = yuv2rgb48le_X_c;
2397             break;
2398         case PIX_FMT_RGB48BE:
2399             *yuv2packed1 = yuv2rgb48be_1_c;
2400             *yuv2packed2 = yuv2rgb48be_2_c;
2401             *yuv2packedX = yuv2rgb48be_X_c;
2402             break;
2403         case PIX_FMT_BGR48LE:
2404             *yuv2packed1 = yuv2bgr48le_1_c;
2405             *yuv2packed2 = yuv2bgr48le_2_c;
2406             *yuv2packedX = yuv2bgr48le_X_c;
2407             break;
2408         case PIX_FMT_BGR48BE:
2409             *yuv2packed1 = yuv2bgr48be_1_c;
2410             *yuv2packed2 = yuv2bgr48be_2_c;
2411             *yuv2packedX = yuv2bgr48be_X_c;
2412             break;
2413         case PIX_FMT_RGB32:
2414         case PIX_FMT_BGR32:
2415 #if CONFIG_SMALL
2416             *yuv2packed1 = yuv2rgb32_1_c;
2417             *yuv2packed2 = yuv2rgb32_2_c;
2418             *yuv2packedX = yuv2rgb32_X_c;
2419 #else
2420 #if CONFIG_SWSCALE_ALPHA
2421                 if (c->alpPixBuf) {
2422                     *yuv2packed1 = yuv2rgba32_1_c;
2423                     *yuv2packed2 = yuv2rgba32_2_c;
2424                     *yuv2packedX = yuv2rgba32_X_c;
2425                 } else
2426 #endif /* CONFIG_SWSCALE_ALPHA */
2427                 {
2428                     *yuv2packed1 = yuv2rgbx32_1_c;
2429                     *yuv2packed2 = yuv2rgbx32_2_c;
2430                     *yuv2packedX = yuv2rgbx32_X_c;
2431                 }
2432 #endif /* !CONFIG_SMALL */
2433             break;
2434         case PIX_FMT_RGB32_1:
2435         case PIX_FMT_BGR32_1:
2436 #if CONFIG_SMALL
2437                 *yuv2packed1 = yuv2rgb32_1_1_c;
2438                 *yuv2packed2 = yuv2rgb32_1_2_c;
2439                 *yuv2packedX = yuv2rgb32_1_X_c;
2440 #else
2441 #if CONFIG_SWSCALE_ALPHA
2442                 if (c->alpPixBuf) {
2443                     *yuv2packed1 = yuv2rgba32_1_1_c;
2444                     *yuv2packed2 = yuv2rgba32_1_2_c;
2445                     *yuv2packedX = yuv2rgba32_1_X_c;
2446                 } else
2447 #endif /* CONFIG_SWSCALE_ALPHA */
2448                 {
2449                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2450                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2451                     *yuv2packedX = yuv2rgbx32_1_X_c;
2452                 }
2453 #endif /* !CONFIG_SMALL */
2454                 break;
2455         case PIX_FMT_RGB24:
2456             *yuv2packed1 = yuv2rgb24_1_c;
2457             *yuv2packed2 = yuv2rgb24_2_c;
2458             *yuv2packedX = yuv2rgb24_X_c;
2459             break;
2460         case PIX_FMT_BGR24:
2461             *yuv2packed1 = yuv2bgr24_1_c;
2462             *yuv2packed2 = yuv2bgr24_2_c;
2463             *yuv2packedX = yuv2bgr24_X_c;
2464             break;
2465         case PIX_FMT_RGB565LE:
2466         case PIX_FMT_RGB565BE:
2467         case PIX_FMT_BGR565LE:
2468         case PIX_FMT_BGR565BE:
2469             *yuv2packed1 = yuv2rgb16_1_c;
2470             *yuv2packed2 = yuv2rgb16_2_c;
2471             *yuv2packedX = yuv2rgb16_X_c;
2472             break;
2473         case PIX_FMT_RGB555LE:
2474         case PIX_FMT_RGB555BE:
2475         case PIX_FMT_BGR555LE:
2476         case PIX_FMT_BGR555BE:
2477             *yuv2packed1 = yuv2rgb15_1_c;
2478             *yuv2packed2 = yuv2rgb15_2_c;
2479             *yuv2packedX = yuv2rgb15_X_c;
2480             break;
2481         case PIX_FMT_RGB444LE:
2482         case PIX_FMT_RGB444BE:
2483         case PIX_FMT_BGR444LE:
2484         case PIX_FMT_BGR444BE:
2485             *yuv2packed1 = yuv2rgb12_1_c;
2486             *yuv2packed2 = yuv2rgb12_2_c;
2487             *yuv2packedX = yuv2rgb12_X_c;
2488             break;
2489         case PIX_FMT_RGB8:
2490         case PIX_FMT_BGR8:
2491             *yuv2packed1 = yuv2rgb8_1_c;
2492             *yuv2packed2 = yuv2rgb8_2_c;
2493             *yuv2packedX = yuv2rgb8_X_c;
2494             break;
2495         case PIX_FMT_RGB4:
2496         case PIX_FMT_BGR4:
2497             *yuv2packed1 = yuv2rgb4_1_c;
2498             *yuv2packed2 = yuv2rgb4_2_c;
2499             *yuv2packedX = yuv2rgb4_X_c;
2500             break;
2501         case PIX_FMT_RGB4_BYTE:
2502         case PIX_FMT_BGR4_BYTE:
2503             *yuv2packed1 = yuv2rgb4b_1_c;
2504             *yuv2packed2 = yuv2rgb4b_2_c;
2505             *yuv2packedX = yuv2rgb4b_X_c;
2506             break;
2507         }
2508     }
2509 }
2510
2511 #define DEBUG_SWSCALE_BUFFERS 0
2512 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2513
2514 static int swScale(SwsContext *c, const uint8_t* src[],
2515                    int srcStride[], int srcSliceY,
2516                    int srcSliceH, uint8_t* dst[], int dstStride[])
2517 {
2518     /* load a few things into local vars to make the code more readable? and faster */
2519     const int srcW= c->srcW;
2520     const int dstW= c->dstW;
2521     const int dstH= c->dstH;
2522     const int chrDstW= c->chrDstW;
2523     const int chrSrcW= c->chrSrcW;
2524     const int lumXInc= c->lumXInc;
2525     const int chrXInc= c->chrXInc;
2526     const enum PixelFormat dstFormat= c->dstFormat;
2527     const int flags= c->flags;
2528     int16_t *vLumFilterPos= c->vLumFilterPos;
2529     int16_t *vChrFilterPos= c->vChrFilterPos;
2530     int16_t *hLumFilterPos= c->hLumFilterPos;
2531     int16_t *hChrFilterPos= c->hChrFilterPos;
2532     int16_t *vLumFilter= c->vLumFilter;
2533     int16_t *vChrFilter= c->vChrFilter;
2534     int16_t *hLumFilter= c->hLumFilter;
2535     int16_t *hChrFilter= c->hChrFilter;
2536     int32_t *lumMmxFilter= c->lumMmxFilter;
2537     int32_t *chrMmxFilter= c->chrMmxFilter;
2538     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2539     const int vLumFilterSize= c->vLumFilterSize;
2540     const int vChrFilterSize= c->vChrFilterSize;
2541     const int hLumFilterSize= c->hLumFilterSize;
2542     const int hChrFilterSize= c->hChrFilterSize;
2543     int16_t **lumPixBuf= c->lumPixBuf;
2544     int16_t **chrUPixBuf= c->chrUPixBuf;
2545     int16_t **chrVPixBuf= c->chrVPixBuf;
2546     int16_t **alpPixBuf= c->alpPixBuf;
2547     const int vLumBufSize= c->vLumBufSize;
2548     const int vChrBufSize= c->vChrBufSize;
2549     uint8_t *formatConvBuffer= c->formatConvBuffer;
2550     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2551     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2552     int lastDstY;
2553     uint32_t *pal=c->pal_yuv;
2554
2555     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2556     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2557     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2558     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2559     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2560     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2561
2562     /* vars which will change and which we need to store back in the context */
2563     int dstY= c->dstY;
2564     int lumBufIndex= c->lumBufIndex;
2565     int chrBufIndex= c->chrBufIndex;
2566     int lastInLumBuf= c->lastInLumBuf;
2567     int lastInChrBuf= c->lastInChrBuf;
2568
2569     if (isPacked(c->srcFormat)) {
2570         src[0]=
2571         src[1]=
2572         src[2]=
2573         src[3]= src[0];
2574         srcStride[0]=
2575         srcStride[1]=
2576         srcStride[2]=
2577         srcStride[3]= srcStride[0];
2578     }
2579     srcStride[1]<<= c->vChrDrop;
2580     srcStride[2]<<= c->vChrDrop;
2581
2582     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2583                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2584                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2585     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2586                    srcSliceY,    srcSliceH,    dstY,    dstH);
2587     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2588                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2589
2590     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2591         static int warnedAlready=0; //FIXME move this into the context perhaps
2592         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2593             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2594                    "         ->cannot do aligned memory accesses anymore\n");
2595             warnedAlready=1;
2596         }
2597     }
2598
2599     /* Note the user might start scaling the picture in the middle so this
2600        will not get executed. This is not really intended but works
2601        currently, so people might do it. */
2602     if (srcSliceY ==0) {
2603         lumBufIndex=-1;
2604         chrBufIndex=-1;
2605         dstY=0;
2606         lastInLumBuf= -1;
2607         lastInChrBuf= -1;
2608     }
2609
2610     if (!should_dither) {
2611         c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
2612     }
2613     lastDstY= dstY;
2614
2615     for (;dstY < dstH; dstY++) {
2616         const int chrDstY= dstY>>c->chrDstVSubSample;
2617         uint8_t *dest[4] = {
2618             dst[0] + dstStride[0] * dstY,
2619             dst[1] + dstStride[1] * chrDstY,
2620             dst[2] + dstStride[2] * chrDstY,
2621             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2622         };
2623
2624         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2625         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2626         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2627         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2628         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2629         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2630         int enough_lines;
2631
2632         //handle holes (FAST_BILINEAR & weird filters)
2633         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2634         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2635         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2636         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2637
2638         DEBUG_BUFFERS("dstY: %d\n", dstY);
2639         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2640                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2641         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2642                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2643
2644         // Do we have enough lines in this slice to output the dstY line
2645         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2646
2647         if (!enough_lines) {
2648             lastLumSrcY = srcSliceY + srcSliceH - 1;
2649             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2650             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2651                                             lastLumSrcY, lastChrSrcY);
2652         }
2653
2654         //Do horizontal scaling
2655         while(lastInLumBuf < lastLumSrcY) {
2656             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2657             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2658             lumBufIndex++;
2659             assert(lumBufIndex < 2*vLumBufSize);
2660             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2661             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2662             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2663                     hLumFilter, hLumFilterPos, hLumFilterSize,
2664                     formatConvBuffer,
2665                     pal, 0);
2666             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2667                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2668                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2669                         formatConvBuffer,
2670                         pal, 1);
2671             lastInLumBuf++;
2672             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2673                                lumBufIndex,    lastInLumBuf);
2674         }
2675         while(lastInChrBuf < lastChrSrcY) {
2676             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2677             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2678             chrBufIndex++;
2679             assert(chrBufIndex < 2*vChrBufSize);
2680             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2681             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2682             //FIXME replace parameters through context struct (some at least)
2683
2684             if (c->needs_hcscale)
2685                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2686                           chrDstW, src1, src2, chrSrcW, chrXInc,
2687                           hChrFilter, hChrFilterPos, hChrFilterSize,
2688                           formatConvBuffer, pal);
2689             lastInChrBuf++;
2690             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2691                                chrBufIndex,    lastInChrBuf);
2692         }
2693         //wrap buf index around to stay inside the ring buffer
2694         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2695         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2696         if (!enough_lines)
2697             break; //we can't output a dstY line so let's try with the next slice
2698
2699 #if HAVE_MMX
2700         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2701 #endif
2702         if (should_dither) {
2703             c->chrDither8 = dither_8x8_128[chrDstY & 7];
2704             c->lumDither8 = dither_8x8_128[dstY & 7];
2705         }
2706         if (dstY >= dstH-2) {
2707             // hmm looks like we can't use MMX here without overwriting this array's tail
2708             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2709                                            &yuv2packed1, &yuv2packed2,
2710                                            &yuv2packedX);
2711         }
2712
2713         {
2714             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2715             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2716             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2717             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2718
2719             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2720                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2721                 if ((dstY&chrSkipMask) || isGray(dstFormat))
2722                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
2723                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2724                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2725                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
2726                              dest, dstW, chrDstW);
2727                 } else { //General YV12
2728                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
2729                              lumSrcPtr, vLumFilterSize,
2730                              vChrFilter + chrDstY * vChrFilterSize,
2731                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2732                              alpSrcPtr, dest, dstW, chrDstW);
2733                 }
2734             } else {
2735                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2736                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2737                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2738                     int chrAlpha = vChrFilter[2 * dstY + 1];
2739                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2740                                 alpPixBuf ? *alpSrcPtr : NULL,
2741                                 dest[0], dstW, chrAlpha, dstY);
2742                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2743                     int lumAlpha = vLumFilter[2 * dstY + 1];
2744                     int chrAlpha = vChrFilter[2 * dstY + 1];
2745                     lumMmxFilter[2] =
2746                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2747                     chrMmxFilter[2] =
2748                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2749                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2750                                 alpPixBuf ? alpSrcPtr : NULL,
2751                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2752                 } else { //general RGB
2753                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2754                                 lumSrcPtr, vLumFilterSize,
2755                                 vChrFilter + dstY * vChrFilterSize,
2756                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2757                                 alpSrcPtr, dest[0], dstW, dstY);
2758                 }
2759             }
2760         }
2761     }
2762
2763     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2764         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2765
2766 #if HAVE_MMX2
2767     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2768         __asm__ volatile("sfence":::"memory");
2769 #endif
2770     emms_c();
2771
2772     /* store changed local vars back in the context */
2773     c->dstY= dstY;
2774     c->lumBufIndex= lumBufIndex;
2775     c->chrBufIndex= chrBufIndex;
2776     c->lastInLumBuf= lastInLumBuf;
2777     c->lastInChrBuf= lastInChrBuf;
2778
2779     return dstY - lastDstY;
2780 }
2781
2782 static av_cold void sws_init_swScale_c(SwsContext *c)
2783 {
2784     enum PixelFormat srcFormat = c->srcFormat;
2785
2786     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2787                                    &c->yuv2packed1, &c->yuv2packed2,
2788                                    &c->yuv2packedX);
2789
2790     c->chrToYV12 = NULL;
2791     switch(srcFormat) {
2792         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2793         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2794         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2795         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2796         case PIX_FMT_RGB8     :
2797         case PIX_FMT_BGR8     :
2798         case PIX_FMT_PAL8     :
2799         case PIX_FMT_BGR4_BYTE:
2800         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2801 #if HAVE_BIGENDIAN
2802         case PIX_FMT_YUV444P9LE:
2803         case PIX_FMT_YUV420P9LE:
2804         case PIX_FMT_YUV422P10LE:
2805         case PIX_FMT_YUV420P10LE:
2806         case PIX_FMT_YUV444P10LE:
2807         case PIX_FMT_YUV420P16LE:
2808         case PIX_FMT_YUV422P16LE:
2809         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2810 #else
2811         case PIX_FMT_YUV444P9BE:
2812         case PIX_FMT_YUV420P9BE:
2813         case PIX_FMT_YUV444P10BE:
2814         case PIX_FMT_YUV422P10BE:
2815         case PIX_FMT_YUV420P10BE:
2816         case PIX_FMT_YUV420P16BE:
2817         case PIX_FMT_YUV422P16BE:
2818         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2819 #endif
2820     }
2821     if (c->chrSrcHSubSample) {
2822         switch(srcFormat) {
2823         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2824         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2825         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2826         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2827         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2828         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2829         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2830         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2831         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2832         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2833         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2834         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2835         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2836         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2837         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2838         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2839         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2840         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2841         }
2842     } else {
2843         switch(srcFormat) {
2844         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2845         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2846         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2847         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2848         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2849         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2850         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2851         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2852         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2853         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2854         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2855         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2856         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2857         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2858         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2859         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2860         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2861         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2862         }
2863     }
2864
2865     c->lumToYV12 = NULL;
2866     c->alpToYV12 = NULL;
2867     switch (srcFormat) {
2868 #if HAVE_BIGENDIAN
2869     case PIX_FMT_YUV444P9LE:
2870     case PIX_FMT_YUV420P9LE:
2871     case PIX_FMT_YUV422P10LE:
2872     case PIX_FMT_YUV420P10LE:
2873     case PIX_FMT_YUV444P10LE:
2874     case PIX_FMT_YUV420P16LE:
2875     case PIX_FMT_YUV422P16LE:
2876     case PIX_FMT_YUV444P16LE:
2877     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2878 #else
2879     case PIX_FMT_YUV444P9BE:
2880     case PIX_FMT_YUV420P9BE:
2881     case PIX_FMT_YUV444P10BE:
2882     case PIX_FMT_YUV422P10BE:
2883     case PIX_FMT_YUV420P10BE:
2884     case PIX_FMT_YUV420P16BE:
2885     case PIX_FMT_YUV422P16BE:
2886     case PIX_FMT_YUV444P16BE:
2887     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2888 #endif
2889     case PIX_FMT_YUYV422  :
2890     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2891     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2892     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2893     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2894     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2895     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2896     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2897     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2898     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2899     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2900     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2901     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2902     case PIX_FMT_RGB8     :
2903     case PIX_FMT_BGR8     :
2904     case PIX_FMT_PAL8     :
2905     case PIX_FMT_BGR4_BYTE:
2906     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2907     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2908     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2909     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2910     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2911     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2912     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2913     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2914     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2915     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2916     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2917     }
2918     if (c->alpPixBuf) {
2919         switch (srcFormat) {
2920         case PIX_FMT_BGRA:
2921         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2922         case PIX_FMT_ABGR:
2923         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2924         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2925         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
2926         }
2927     }
2928
2929
2930     if (c->srcBpc == 8) {
2931         if (c->dstBpc <= 10) {
2932             c->hyScale = c->hcScale = hScale8To15_c;
2933             if (c->flags & SWS_FAST_BILINEAR) {
2934                 c->hyscale_fast = hyscale_fast_c;
2935                 c->hcscale_fast = hcscale_fast_c;
2936             }
2937         } else {
2938             c->hyScale = c->hcScale = hScale8To19_c;
2939         }
2940     } else {
2941         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
2942     }
2943
2944     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2945         if (c->dstBpc <= 10) {
2946             if (c->srcRange) {
2947                 c->lumConvertRange = lumRangeFromJpeg_c;
2948                 c->chrConvertRange = chrRangeFromJpeg_c;
2949             } else {
2950                 c->lumConvertRange = lumRangeToJpeg_c;
2951                 c->chrConvertRange = chrRangeToJpeg_c;
2952             }
2953         } else {
2954             if (c->srcRange) {
2955                 c->lumConvertRange = lumRangeFromJpeg16_c;
2956                 c->chrConvertRange = chrRangeFromJpeg16_c;
2957             } else {
2958                 c->lumConvertRange = lumRangeToJpeg16_c;
2959                 c->chrConvertRange = chrRangeToJpeg16_c;
2960             }
2961         }
2962     }
2963
2964     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2965           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2966         c->needs_hcscale = 1;
2967 }
2968
2969 SwsFunc ff_getSwsFunc(SwsContext *c)
2970 {
2971     sws_init_swScale_c(c);
2972
2973     if (HAVE_MMX)
2974         ff_sws_init_swScale_mmx(c);
2975     if (HAVE_ALTIVEC)
2976         ff_sws_init_swScale_altivec(c);
2977
2978     return swScale;
2979 }