git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/avassert.h"
  64 #include "libavutil/intreadwrite.h"
  65 #include "libavutil/cpu.h"
  66 #include "libavutil/avutil.h"
  67 #include "libavutil/mathematics.h"
  68 #include "libavutil/bswap.h"
  69 #include "libavutil/pixdesc.h"
  70
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 186 {  36, 68, 60, 92, 34, 66, 58, 90,},
 187 { 100,  4,124, 28, 98,  2,122, 26,},
 188 {  52, 84, 44, 76, 50, 82, 42, 74,},
 189 { 116, 20,108, 12,114, 18,106, 10,},
 190 {  32, 64, 56, 88, 38, 70, 62, 94,},
 191 {  96,  0,120, 24,102,  6,126, 30,},
 192 {  48, 80, 40, 72, 54, 86, 46, 78,},
 193 { 112, 16,104,  8,118, 22,110, 14,},
 194 };
 195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 196 {  64, 64, 64, 64, 64, 64, 64, 64 };
 197
 198 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 199 {
 200   {   0,  1,  0,  1,  0,  1,  0,  1,},
 201   {   1,  0,  1,  0,  1,  0,  1,  0,},
 202   {   0,  1,  0,  1,  0,  1,  0,  1,},
 203   {   1,  0,  1,  0,  1,  0,  1,  0,},
 204   {   0,  1,  0,  1,  0,  1,  0,  1,},
 205   {   1,  0,  1,  0,  1,  0,  1,  0,},
 206   {   0,  1,  0,  1,  0,  1,  0,  1,},
 207   {   1,  0,  1,  0,  1,  0,  1,  0,},
 208 },{
 209   {   1,  2,  1,  2,  1,  2,  1,  2,},
 210   {   3,  0,  3,  0,  3,  0,  3,  0,},
 211   {   1,  2,  1,  2,  1,  2,  1,  2,},
 212   {   3,  0,  3,  0,  3,  0,  3,  0,},
 213   {   1,  2,  1,  2,  1,  2,  1,  2,},
 214   {   3,  0,  3,  0,  3,  0,  3,  0,},
 215   {   1,  2,  1,  2,  1,  2,  1,  2,},
 216   {   3,  0,  3,  0,  3,  0,  3,  0,},
 217 },{
 218   {   2,  4,  3,  5,  2,  4,  3,  5,},
 219   {   6,  0,  7,  1,  6,  0,  7,  1,},
 220   {   3,  5,  2,  4,  3,  5,  2,  4,},
 221   {   7,  1,  6,  0,  7,  1,  6,  0,},
 222   {   2,  4,  3,  5,  2,  4,  3,  5,},
 223   {   6,  0,  7,  1,  6,  0,  7,  1,},
 224   {   3,  5,  2,  4,  3,  5,  2,  4,},
 225   {   7,  1,  6,  0,  7,  1,  6,  0,},
 226 },{
 227   {   4,  8,  7, 11,  4,  8,  7, 11,},
 228   {  12,  0, 15,  3, 12,  0, 15,  3,},
 229   {   6, 10,  5,  9,  6, 10,  5,  9,},
 230   {  14,  2, 13,  1, 14,  2, 13,  1,},
 231   {   4,  8,  7, 11,  4,  8,  7, 11,},
 232   {  12,  0, 15,  3, 12,  0, 15,  3,},
 233   {   6, 10,  5,  9,  6, 10,  5,  9,},
 234   {  14,  2, 13,  1, 14,  2, 13,  1,},
 235 },{
 236   {   9, 17, 15, 23,  8, 16, 14, 22,},
 237   {  25,  1, 31,  7, 24,  0, 30,  6,},
 238   {  13, 21, 11, 19, 12, 20, 10, 18,},
 239   {  29,  5, 27,  3, 28,  4, 26,  2,},
 240   {   8, 16, 14, 22,  9, 17, 15, 23,},
 241   {  24,  0, 30,  6, 25,  1, 31,  7,},
 242   {  12, 20, 10, 18, 13, 21, 11, 19,},
 243   {  28,  4, 26,  2, 29,  5, 27,  3,},
 244 },{
 245   {  18, 34, 30, 46, 17, 33, 29, 45,},
 246   {  50,  2, 62, 14, 49,  1, 61, 13,},
 247   {  26, 42, 22, 38, 25, 41, 21, 37,},
 248   {  58, 10, 54,  6, 57,  9, 53,  5,},
 249   {  16, 32, 28, 44, 19, 35, 31, 47,},
 250   {  48,  0, 60, 12, 51,  3, 63, 15,},
 251   {  24, 40, 20, 36, 27, 43, 23, 39,},
 252   {  56,  8, 52,  4, 59, 11, 55,  7,},
 253 },{
 254   {  18, 34, 30, 46, 17, 33, 29, 45,},
 255   {  50,  2, 62, 14, 49,  1, 61, 13,},
 256   {  26, 42, 22, 38, 25, 41, 21, 37,},
 257   {  58, 10, 54,  6, 57,  9, 53,  5,},
 258   {  16, 32, 28, 44, 19, 35, 31, 47,},
 259   {  48,  0, 60, 12, 51,  3, 63, 15,},
 260   {  24, 40, 20, 36, 27, 43, 23, 39,},
 261   {  56,  8, 52,  4, 59, 11, 55,  7,},
 262 },{
 263   {  36, 68, 60, 92, 34, 66, 58, 90,},
 264   { 100,  4,124, 28, 98,  2,122, 26,},
 265   {  52, 84, 44, 76, 50, 82, 42, 74,},
 266   { 116, 20,108, 12,114, 18,106, 10,},
 267   {  32, 64, 56, 88, 38, 70, 62, 94,},
 268   {  96,  0,120, 24,102,  6,126, 30,},
 269   {  48, 80, 40, 72, 54, 86, 46, 78,},
 270   { 112, 16,104,  8,118, 22,110, 14,},
 271 }};
 272
 273 static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 274
 275 const uint16_t dither_scale[15][16]={
 276 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 277 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
 278 {    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
 279 {    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
 280 {    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
 281 {    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
 282 {    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
 283 {    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
 284 {    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
 285 {    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
 286 {    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
 287 {    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
 288 {    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
 289 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 290 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 291 };
 292
 293 static av_always_inline void
 294 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 295                       int lumFilterSize, const int16_t *chrFilter,
 296                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 297                       int chrFilterSize, const int32_t **alpSrc,
 298                       uint16_t *dest[4], int dstW, int chrDstW,
 299                       int big_endian, int output_bits)
 300 {
 301     //FIXME Optimize (just quickly written not optimized..)
 302     int i;
 303     int dword= output_bits == 16;
 304     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 305              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 306     int shift = 11 + 4*dword + 16 - output_bits - 1;
 307
 308 #define output_pixel(pos, val) \
 309     if (big_endian) { \
 310         AV_WB16(pos, av_clip_uint16(val >> shift)); \
 311     } else { \
 312         AV_WL16(pos, av_clip_uint16(val >> shift)); \
 313     }
 314     for (i = 0; i < dstW; i++) {
 315         int val = 1 << (26-output_bits + 4*dword - 1);
 316         int j;
 317
 318         for (j = 0; j < lumFilterSize; j++)
 319             val += ((dword ? lumSrc[j][i] : ((int16_t**)lumSrc)[j][i]) * lumFilter[j])>>1;
 320
 321         output_pixel(&yDest[i], val);
 322     }
 323
 324     if (uDest) {
 325         for (i = 0; i < chrDstW; i++) {
 326             int u = 1 << (26-output_bits + 4*dword - 1);
 327             int v = 1 << (26-output_bits + 4*dword - 1);
 328             int j;
 329
 330             for (j = 0; j < chrFilterSize; j++) {
 331                 u += ((dword ? chrUSrc[j][i] : ((int16_t**)chrUSrc)[j][i]) * chrFilter[j]) >> 1;
 332                 v += ((dword ? chrVSrc[j][i] : ((int16_t**)chrVSrc)[j][i]) * chrFilter[j]) >> 1;
 333             }
 334
 335             output_pixel(&uDest[i], u);
 336             output_pixel(&vDest[i], v);
 337         }
 338     }
 339
 340     if (CONFIG_SWSCALE_ALPHA && aDest) {
 341         for (i = 0; i < dstW; i++) {
 342             int val = 1 << (26-output_bits + 4*dword - 1);
 343             int j;
 344
 345             for (j = 0; j < lumFilterSize; j++)
 346                 val += ((dword ? alpSrc[j][i] : ((int16_t**)alpSrc)[j][i]) * lumFilter[j]) >> 1;
 347
 348             output_pixel(&aDest[i], val);
 349         }
 350     }
 351 #undef output_pixel
 352 }
 353
 354 static av_always_inline void
 355 yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 356                       int lumFilterSize, const int16_t *chrFilter,
 357                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 358                       int chrFilterSize, const int16_t **alpSrc,
 359                       uint16_t *dest[4], int dstW, int chrDstW,
 360                       int big_endian, int output_bits)
 361 {
 362     //FIXME Optimize (just quickly written not optimized..)
 363     int i;
 364     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 365              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 366     int shift = 11 + 16 - output_bits - 1;
 367
 368 #define output_pixel(pos, val) \
 369     if (big_endian) { \
 370         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 371     } else { \
 372         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 373     }
 374     for (i = 0; i < dstW; i++) {
 375         int val = 1 << (26-output_bits - 1);
 376         int j;
 377
 378         for (j = 0; j < lumFilterSize; j++)
 379             val += (lumSrc[j][i] * lumFilter[j]) >> 1;
 380
 381         output_pixel(&yDest[i], val);
 382     }
 383
 384     if (uDest) {
 385         for (i = 0; i < chrDstW; i++) {
 386             int u = 1 << (26-output_bits - 1);
 387             int v = 1 << (26-output_bits - 1);
 388             int j;
 389
 390             for (j = 0; j < chrFilterSize; j++) {
 391                 u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
 392                 v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
 393             }
 394
 395             output_pixel(&uDest[i], u);
 396             output_pixel(&vDest[i], v);
 397         }
 398     }
 399
 400     if (CONFIG_SWSCALE_ALPHA && aDest) {
 401         for (i = 0; i < dstW; i++) {
 402             int val = 1 << (26-output_bits - 1);
 403             int j;
 404
 405             for (j = 0; j < lumFilterSize; j++)
 406                 val += (alpSrc[j][i] * lumFilter[j]) >> 1;
 407
 408             output_pixel(&aDest[i], val);
 409         }
 410     }
 411 #undef output_pixel
 412 }
 413
 414 #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \
 415 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 416                               const int16_t **_lumSrc, int lumFilterSize, \
 417                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 418                               const int16_t **_chrVSrc, \
 419                               int chrFilterSize, const int16_t **_alpSrc, \
 420                               uint8_t *_dest[4], int dstW, int chrDstW) \
 421 { \
 422     const typeX_t **lumSrc  = (const typeX_t **) _lumSrc, \
 423                   **chrUSrc = (const typeX_t **) _chrUSrc, \
 424                   **chrVSrc = (const typeX_t **) _chrVSrc, \
 425                   **alpSrc  = (const typeX_t **) _alpSrc; \
 426     yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \
 427                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 428                          alpSrc, (uint16_t **) _dest, \
 429                          dstW, chrDstW, is_be, bits); \
 430 }
 431 yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t);
 432 yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t);
 433 yuv2NBPS(10, BE, 1, yuv2yuvX10_c_template, int16_t);
 434 yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t);
 435 yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t);
 436 yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t);
 437
 438 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 439                        const int16_t **lumSrc, int lumFilterSize,
 440                        const int16_t *chrFilter, const int16_t **chrUSrc,
 441                        const int16_t **chrVSrc,
 442                        int chrFilterSize, const int16_t **alpSrc,
 443                        uint8_t *dest[4], int dstW, int chrDstW)
 444 {
 445     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 446             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 447     int i;
 448     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 449
 450     //FIXME Optimize (just quickly written not optimized..)
 451     for (i=0; i<dstW; i++) {
 452         int val = lumDither[i & 7] << 12;
 453         int j;
 454         for (j=0; j<lumFilterSize; j++)
 455             val += lumSrc[j][i] * lumFilter[j];
 456
 457         yDest[i]= av_clip_uint8(val>>19);
 458     }
 459
 460     if (uDest)
 461         for (i=0; i<chrDstW; i++) {
 462             int u = chrDither[i & 7] << 12;
 463             int v = chrDither[(i + 3) & 7] << 12;
 464             int j;
 465             for (j=0; j<chrFilterSize; j++) {
 466                 u += chrUSrc[j][i] * chrFilter[j];
 467                 v += chrVSrc[j][i] * chrFilter[j];
 468             }
 469
 470             uDest[i]= av_clip_uint8(u>>19);
 471             vDest[i]= av_clip_uint8(v>>19);
 472         }
 473
 474     if (CONFIG_SWSCALE_ALPHA && aDest)
 475         for (i=0; i<dstW; i++) {
 476             int val = lumDither[i & 7] << 12;
 477             int j;
 478             for (j=0; j<lumFilterSize; j++)
 479                 val += alpSrc[j][i] * lumFilter[j];
 480
 481             aDest[i]= av_clip_uint8(val>>19);
 482         }
 483 }
 484
 485 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 486                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 487                        const int16_t *alpSrc,
 488                        uint8_t *dest[4], int dstW, int chrDstW)
 489 {
 490     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 491             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 492     int i;
 493     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 494
 495     for (i=0; i<dstW; i++) {
 496         int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
 497         yDest[i]= av_clip_uint8(val);
 498     }
 499
 500     if (uDest)
 501         for (i=0; i<chrDstW; i++) {
 502             int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
 503             int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
 504             uDest[i]= av_clip_uint8(u);
 505             vDest[i]= av_clip_uint8(v);
 506         }
 507
 508     if (CONFIG_SWSCALE_ALPHA && aDest)
 509         for (i=0; i<dstW; i++) {
 510             int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
 511             aDest[i]= av_clip_uint8(val);
 512         }
 513 }
 514
 515 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 516                         const int16_t **lumSrc, int lumFilterSize,
 517                         const int16_t *chrFilter, const int16_t **chrUSrc,
 518                         const int16_t **chrVSrc, int chrFilterSize,
 519                         const int16_t **alpSrc, uint8_t *dest[4],
 520                         int dstW, int chrDstW)
 521 {
 522     uint8_t *yDest = dest[0], *uDest = dest[1];
 523     enum PixelFormat dstFormat = c->dstFormat;
 524     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 525
 526     //FIXME Optimize (just quickly written not optimized..)
 527     int i;
 528     for (i=0; i<dstW; i++) {
 529         int val = lumDither[i & 7] << 12;
 530         int j;
 531         for (j=0; j<lumFilterSize; j++)
 532             val += lumSrc[j][i] * lumFilter[j];
 533
 534         yDest[i]= av_clip_uint8(val>>19);
 535     }
 536
 537     if (!uDest)
 538         return;
 539
 540     if (dstFormat == PIX_FMT_NV12)
 541         for (i=0; i<chrDstW; i++) {
 542             int u = chrDither[i & 7] << 12;
 543             int v = chrDither[(i + 3) & 7] << 12;
 544             int j;
 545             for (j=0; j<chrFilterSize; j++) {
 546                 u += chrUSrc[j][i] * chrFilter[j];
 547                 v += chrVSrc[j][i] * chrFilter[j];
 548             }
 549
 550             uDest[2*i]= av_clip_uint8(u>>19);
 551             uDest[2*i+1]= av_clip_uint8(v>>19);
 552         }
 553     else
 554         for (i=0; i<chrDstW; i++) {
 555             int u = chrDither[i & 7] << 12;
 556             int v = chrDither[(i + 3) & 7] << 12;
 557             int j;
 558             for (j=0; j<chrFilterSize; j++) {
 559                 u += chrUSrc[j][i] * chrFilter[j];
 560                 v += chrVSrc[j][i] * chrFilter[j];
 561             }
 562
 563             uDest[2*i]= av_clip_uint8(v>>19);
 564             uDest[2*i+1]= av_clip_uint8(u>>19);
 565         }
 566 }
 567
 568 #define output_pixel(pos, val) \
 569         if (target == PIX_FMT_GRAY16BE) { \
 570             AV_WB16(pos, val); \
 571         } else { \
 572             AV_WL16(pos, val); \
 573         }
 574
 575 static av_always_inline void
 576 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 577                         const int32_t **lumSrc, int lumFilterSize,
 578                         const int16_t *chrFilter, const int32_t **chrUSrc,
 579                         const int32_t **chrVSrc, int chrFilterSize,
 580                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 581                         int y, enum PixelFormat target)
 582 {
 583     int i;
 584
 585     for (i = 0; i < (dstW >> 1); i++) {
 586         int j;
 587         int Y1 = 1 << 14;
 588         int Y2 = 1 << 14;
 589
 590         for (j = 0; j < lumFilterSize; j++) {
 591             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 592             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 593         }
 594         Y1 >>= 15;
 595         Y2 >>= 15;
 596         if ((Y1 | Y2) & 0x10000) {
 597             Y1 = av_clip_uint16(Y1);
 598             Y2 = av_clip_uint16(Y2);
 599         }
 600         output_pixel(&dest[i * 2 + 0], Y1);
 601         output_pixel(&dest[i * 2 + 1], Y2);
 602     }
 603 }
 604
 605 static av_always_inline void
 606 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 607                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 608                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 609                         int yalpha, int uvalpha, int y,
 610                         enum PixelFormat target)
 611 {
 612     int  yalpha1 = 4095 - yalpha;
 613     int i;
 614     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 615
 616     for (i = 0; i < (dstW >> 1); i++) {
 617         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 618         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 619
 620         output_pixel(&dest[i * 2 + 0], Y1);
 621         output_pixel(&dest[i * 2 + 1], Y2);
 622     }
 623 }
 624
 625 static av_always_inline void
 626 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 627                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 628                         const int32_t *abuf0, uint16_t *dest, int dstW,
 629                         int uvalpha, int y, enum PixelFormat target)
 630 {
 631     int i;
 632
 633     for (i = 0; i < (dstW >> 1); i++) {
 634         int Y1 = (buf0[i * 2    ]+4)>>3;
 635         int Y2 = (buf0[i * 2 + 1]+4)>>3;
 636
 637         output_pixel(&dest[i * 2 + 0], Y1);
 638         output_pixel(&dest[i * 2 + 1], Y2);
 639     }
 640 }
 641
 642 #undef output_pixel
 643
 644 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 645 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 646                         const int16_t **_lumSrc, int lumFilterSize, \
 647                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 648                         const int16_t **_chrVSrc, int chrFilterSize, \
 649                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 650                         int y) \
 651 { \
 652     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 653                   **chrUSrc = (const int32_t **) _chrUSrc, \
 654                   **chrVSrc = (const int32_t **) _chrVSrc, \
 655                   **alpSrc  = (const int32_t **) _alpSrc; \
 656     uint16_t *dest = (uint16_t *) _dest; \
 657     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 658                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 659                           alpSrc, dest, dstW, y, fmt); \
 660 } \
 661  \
 662 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 663                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 664                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 665                         int yalpha, int uvalpha, int y) \
 666 { \
 667     const int32_t **buf  = (const int32_t **) _buf, \
 668                   **ubuf = (const int32_t **) _ubuf, \
 669                   **vbuf = (const int32_t **) _vbuf, \
 670                   **abuf = (const int32_t **) _abuf; \
 671     uint16_t *dest = (uint16_t *) _dest; \
 672     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 673                           dest, dstW, yalpha, uvalpha, y, fmt); \
 674 } \
 675  \
 676 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 677                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 678                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 679                         int uvalpha, int y) \
 680 { \
 681     const int32_t *buf0  = (const int32_t *)  _buf0, \
 682                  **ubuf  = (const int32_t **) _ubuf, \
 683                  **vbuf  = (const int32_t **) _vbuf, \
 684                   *abuf0 = (const int32_t *)  _abuf0; \
 685     uint16_t *dest = (uint16_t *) _dest; \
 686     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 687                                   dstW, uvalpha, y, fmt); \
 688 }
 689
 690 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 691 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 692
 693 #define output_pixel(pos, acc) \
 694     if (target == PIX_FMT_MONOBLACK) { \
 695         pos = acc; \
 696     } else { \
 697         pos = ~acc; \
 698     }
 699
 700 static av_always_inline void
 701 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 702                       const int16_t **lumSrc, int lumFilterSize,
 703                       const int16_t *chrFilter, const int16_t **chrUSrc,
 704                       const int16_t **chrVSrc, int chrFilterSize,
 705                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 706                       int y, enum PixelFormat target)
 707 {
 708     const uint8_t * const d128=dither_8x8_220[y&7];
 709     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 710     int i;
 711     int acc = 0;
 712
 713     for (i = 0; i < dstW - 1; i += 2) {
 714         int j;
 715         int Y1 = 1 << 18;
 716         int Y2 = 1 << 18;
 717
 718         for (j = 0; j < lumFilterSize; j++) {
 719             Y1 += lumSrc[j][i]   * lumFilter[j];
 720             Y2 += lumSrc[j][i+1] * lumFilter[j];
 721         }
 722         Y1 >>= 19;
 723         Y2 >>= 19;
 724         if ((Y1 | Y2) & 0x100) {
 725             Y1 = av_clip_uint8(Y1);
 726             Y2 = av_clip_uint8(Y2);
 727         }
 728         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 729         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 730         if ((i & 7) == 6) {
 731             output_pixel(*dest++, acc);
 732         }
 733     }
 734 }
 735
 736 static av_always_inline void
 737 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 738                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 739                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 740                       int yalpha, int uvalpha, int y,
 741                       enum PixelFormat target)
 742 {
 743     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 744     const uint8_t * const d128 = dither_8x8_220[y & 7];
 745     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 746     int  yalpha1 = 4095 - yalpha;
 747     int i;
 748
 749     for (i = 0; i < dstW - 7; i += 8) {
 750         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 751         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 752         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 753         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 754         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 755         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 756         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 757         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 758         output_pixel(*dest++, acc);
 759     }
 760 }
 761
 762 static av_always_inline void
 763 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 764                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 765                       const int16_t *abuf0, uint8_t *dest, int dstW,
 766                       int uvalpha, int y, enum PixelFormat target)
 767 {
 768     const uint8_t * const d128 = dither_8x8_220[y & 7];
 769     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 770     int i;
 771
 772     for (i = 0; i < dstW - 7; i += 8) {
 773         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 774         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 775         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 776         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 777         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 778         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 779         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 780         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 781         output_pixel(*dest++, acc);
 782     }
 783 }
 784
 785 #undef output_pixel
 786
 787 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 788 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 789                                 const int16_t **lumSrc, int lumFilterSize, \
 790                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 791                                 const int16_t **chrVSrc, int chrFilterSize, \
 792                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 793                                 int y) \
 794 { \
 795     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 796                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 797                                   alpSrc, dest, dstW, y, fmt); \
 798 } \
 799  \
 800 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 801                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 802                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 803                                 int yalpha, int uvalpha, int y) \
 804 { \
 805     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 806                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 807 } \
 808  \
 809 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 810                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 811                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 812                                 int uvalpha, int y) \
 813 { \
 814     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 815                                   abuf0, dest, dstW, uvalpha, \
 816                                   y, fmt); \
 817 }
 818
 819 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 820 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 821
 822 #define output_pixels(pos, Y1, U, Y2, V) \
 823     if (target == PIX_FMT_YUYV422) { \
 824         dest[pos + 0] = Y1; \
 825         dest[pos + 1] = U;  \
 826         dest[pos + 2] = Y2; \
 827         dest[pos + 3] = V;  \
 828     } else { \
 829         dest[pos + 0] = U;  \
 830         dest[pos + 1] = Y1; \
 831         dest[pos + 2] = V;  \
 832         dest[pos + 3] = Y2; \
 833     }
 834
 835 static av_always_inline void
 836 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 837                      const int16_t **lumSrc, int lumFilterSize,
 838                      const int16_t *chrFilter, const int16_t **chrUSrc,
 839                      const int16_t **chrVSrc, int chrFilterSize,
 840                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 841                      int y, enum PixelFormat target)
 842 {
 843     int i;
 844
 845     for (i = 0; i < (dstW >> 1); i++) {
 846         int j;
 847         int Y1 = 1 << 18;
 848         int Y2 = 1 << 18;
 849         int U  = 1 << 18;
 850         int V  = 1 << 18;
 851
 852         for (j = 0; j < lumFilterSize; j++) {
 853             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 854             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 855         }
 856         for (j = 0; j < chrFilterSize; j++) {
 857             U += chrUSrc[j][i] * chrFilter[j];
 858             V += chrVSrc[j][i] * chrFilter[j];
 859         }
 860         Y1 >>= 19;
 861         Y2 >>= 19;
 862         U  >>= 19;
 863         V  >>= 19;
 864         if ((Y1 | Y2 | U | V) & 0x100) {
 865             Y1 = av_clip_uint8(Y1);
 866             Y2 = av_clip_uint8(Y2);
 867             U  = av_clip_uint8(U);
 868             V  = av_clip_uint8(V);
 869         }
 870         output_pixels(4*i, Y1, U, Y2, V);
 871     }
 872 }
 873
 874 static av_always_inline void
 875 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 876                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 877                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 878                      int yalpha, int uvalpha, int y,
 879                      enum PixelFormat target)
 880 {
 881     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 882                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 883                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 884     int  yalpha1 = 4095 - yalpha;
 885     int uvalpha1 = 4095 - uvalpha;
 886     int i;
 887
 888     for (i = 0; i < (dstW >> 1); i++) {
 889         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 890         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 891         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 892         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 893
 894         output_pixels(i * 4, Y1, U, Y2, V);
 895     }
 896 }
 897
 898 static av_always_inline void
 899 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 900                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 901                      const int16_t *abuf0, uint8_t *dest, int dstW,
 902                      int uvalpha, int y, enum PixelFormat target)
 903 {
 904     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 905                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 906     int i;
 907
 908     if (uvalpha < 2048) {
 909         for (i = 0; i < (dstW >> 1); i++) {
 910             int Y1 = buf0[i * 2]     >> 7;
 911             int Y2 = buf0[i * 2 + 1] >> 7;
 912             int U  = ubuf1[i]        >> 7;
 913             int V  = vbuf1[i]        >> 7;
 914
 915             output_pixels(i * 4, Y1, U, Y2, V);
 916         }
 917     } else {
 918         for (i = 0; i < (dstW >> 1); i++) {
 919             int Y1 =  buf0[i * 2]          >> 7;
 920             int Y2 =  buf0[i * 2 + 1]      >> 7;
 921             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 922             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 923
 924             output_pixels(i * 4, Y1, U, Y2, V);
 925         }
 926     }
 927 }
 928
 929 #undef output_pixels
 930
 931 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
 932 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 933
 934 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 935 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 936 #define output_pixel(pos, val) \
 937     if (isBE(target)) { \
 938         AV_WB16(pos, val); \
 939     } else { \
 940         AV_WL16(pos, val); \
 941     }
 942
 943 static av_always_inline void
 944 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 945                        const int32_t **lumSrc, int lumFilterSize,
 946                        const int16_t *chrFilter, const int32_t **chrUSrc,
 947                        const int32_t **chrVSrc, int chrFilterSize,
 948                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 949                        int y, enum PixelFormat target)
 950 {
 951     int i;
 952
 953     for (i = 0; i < (dstW >> 1); i++) {
 954         int j;
 955         int Y1 = 0;
 956         int Y2 = 0;
 957         int U  = -128 << 23; // 19
 958         int V  = -128 << 23;
 959         int R, G, B;
 960
 961         for (j = 0; j < lumFilterSize; j++) {
 962             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 963             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 964         }
 965         for (j = 0; j < chrFilterSize; j++) {
 966             U += chrUSrc[j][i] * chrFilter[j];
 967             V += chrVSrc[j][i] * chrFilter[j];
 968         }
 969
 970         // 8bit: 12+15=27; 16-bit: 12+19=31
 971         Y1 >>= 14; // 10
 972         Y2 >>= 14;
 973         U  >>= 14;
 974         V  >>= 14;
 975
 976         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 977         Y1 -= c->yuv2rgb_y_offset;
 978         Y2 -= c->yuv2rgb_y_offset;
 979         Y1 *= c->yuv2rgb_y_coeff;
 980         Y2 *= c->yuv2rgb_y_coeff;
 981         Y1 += 1 << 13; // 21
 982         Y2 += 1 << 13;
 983         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 984
 985         R = V * c->yuv2rgb_v2r_coeff;
 986         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 987         B =                            U * c->yuv2rgb_u2b_coeff;
 988
 989         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 990         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 991         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 992         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 993         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 994         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 995         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 996         dest += 6;
 997     }
 998 }
 999
1000 static av_always_inline void
1001 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
1002                        const int32_t *ubuf[2], const int32_t *vbuf[2],
1003                        const int32_t *abuf[2], uint16_t *dest, int dstW,
1004                        int yalpha, int uvalpha, int y,
1005                        enum PixelFormat target)
1006 {
1007     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
1008                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1009                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1010     int  yalpha1 = 4095 - yalpha;
1011     int uvalpha1 = 4095 - uvalpha;
1012     int i;
1013
1014     for (i = 0; i < (dstW >> 1); i++) {
1015         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
1016         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
1017         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
1018         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
1019         int R, G, B;
1020
1021         Y1 -= c->yuv2rgb_y_offset;
1022         Y2 -= c->yuv2rgb_y_offset;
1023         Y1 *= c->yuv2rgb_y_coeff;
1024         Y2 *= c->yuv2rgb_y_coeff;
1025         Y1 += 1 << 13;
1026         Y2 += 1 << 13;
1027
1028         R = V * c->yuv2rgb_v2r_coeff;
1029         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1030         B =                            U * c->yuv2rgb_u2b_coeff;
1031
1032         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1033         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1034         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1035         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1036         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1037         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1038         dest += 6;
1039     }
1040 }
1041
1042 static av_always_inline void
1043 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
1044                        const int32_t *ubuf[2], const int32_t *vbuf[2],
1045                        const int32_t *abuf0, uint16_t *dest, int dstW,
1046                        int uvalpha, int y, enum PixelFormat target)
1047 {
1048     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1049                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1050     int i;
1051
1052     if (uvalpha < 2048) {
1053         for (i = 0; i < (dstW >> 1); i++) {
1054             int Y1 = (buf0[i * 2]    ) >> 2;
1055             int Y2 = (buf0[i * 2 + 1]) >> 2;
1056             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
1057             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
1058             int R, G, B;
1059
1060             Y1 -= c->yuv2rgb_y_offset;
1061             Y2 -= c->yuv2rgb_y_offset;
1062             Y1 *= c->yuv2rgb_y_coeff;
1063             Y2 *= c->yuv2rgb_y_coeff;
1064             Y1 += 1 << 13;
1065             Y2 += 1 << 13;
1066
1067             R = V * c->yuv2rgb_v2r_coeff;
1068             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1069             B =                            U * c->yuv2rgb_u2b_coeff;
1070
1071             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1072             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1073             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1074             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1075             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1076             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1077             dest += 6;
1078         }
1079     } else {
1080         for (i = 0; i < (dstW >> 1); i++) {
1081             int Y1 = (buf0[i * 2]    ) >> 2;
1082             int Y2 = (buf0[i * 2 + 1]) >> 2;
1083             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
1084             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
1085             int R, G, B;
1086
1087             Y1 -= c->yuv2rgb_y_offset;
1088             Y2 -= c->yuv2rgb_y_offset;
1089             Y1 *= c->yuv2rgb_y_coeff;
1090             Y2 *= c->yuv2rgb_y_coeff;
1091             Y1 += 1 << 13;
1092             Y2 += 1 << 13;
1093
1094             R = V * c->yuv2rgb_v2r_coeff;
1095             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1096             B =                            U * c->yuv2rgb_u2b_coeff;
1097
1098             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1099             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1100             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1101             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1102             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1103             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1104             dest += 6;
1105         }
1106     }
1107 }
1108
1109 #undef output_pixel
1110 #undef r_b
1111 #undef b_r
1112
1113 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
1114 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
1115 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
1116 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
1117
1118 static av_always_inline void
1119 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
1120               int U, int V, int A1, int A2,
1121               const void *_r, const void *_g, const void *_b, int y,
1122               enum PixelFormat target, int hasAlpha)
1123 {
1124     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1125         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1126         uint32_t *dest = (uint32_t *) _dest;
1127         const uint32_t *r = (const uint32_t *) _r;
1128         const uint32_t *g = (const uint32_t *) _g;
1129         const uint32_t *b = (const uint32_t *) _b;
1130
1131 #if CONFIG_SMALL
1132         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1133
1134         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1135         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1136 #else
1137         if (hasAlpha) {
1138             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1139
1140             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1141             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1142         } else {
1143             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1144             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1145         }
1146 #endif
1147     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1148         uint8_t *dest = (uint8_t *) _dest;
1149         const uint8_t *r = (const uint8_t *) _r;
1150         const uint8_t *g = (const uint8_t *) _g;
1151         const uint8_t *b = (const uint8_t *) _b;
1152
1153 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1154 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1155
1156         dest[i * 6 + 0] = r_b[Y1];
1157         dest[i * 6 + 1] =   g[Y1];
1158         dest[i * 6 + 2] = b_r[Y1];
1159         dest[i * 6 + 3] = r_b[Y2];
1160         dest[i * 6 + 4] =   g[Y2];
1161         dest[i * 6 + 5] = b_r[Y2];
1162 #undef r_b
1163 #undef b_r
1164     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1165                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1166                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1167         uint16_t *dest = (uint16_t *) _dest;
1168         const uint16_t *r = (const uint16_t *) _r;
1169         const uint16_t *g = (const uint16_t *) _g;
1170         const uint16_t *b = (const uint16_t *) _b;
1171         int dr1, dg1, db1, dr2, dg2, db2;
1172
1173         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1174             dr1 = dither_2x2_8[ y & 1     ][0];
1175             dg1 = dither_2x2_4[ y & 1     ][0];
1176             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1177             dr2 = dither_2x2_8[ y & 1     ][1];
1178             dg2 = dither_2x2_4[ y & 1     ][1];
1179             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1180         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1181             dr1 = dither_2x2_8[ y & 1     ][0];
1182             dg1 = dither_2x2_8[ y & 1     ][1];
1183             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1184             dr2 = dither_2x2_8[ y & 1     ][1];
1185             dg2 = dither_2x2_8[ y & 1     ][0];
1186             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1187         } else {
1188             dr1 = dither_4x4_16[ y & 3     ][0];
1189             dg1 = dither_4x4_16[ y & 3     ][1];
1190             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1191             dr2 = dither_4x4_16[ y & 3     ][1];
1192             dg2 = dither_4x4_16[ y & 3     ][0];
1193             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1194         }
1195
1196         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1197         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1198     } else /* 8/4-bit */ {
1199         uint8_t *dest = (uint8_t *) _dest;
1200         const uint8_t *r = (const uint8_t *) _r;
1201         const uint8_t *g = (const uint8_t *) _g;
1202         const uint8_t *b = (const uint8_t *) _b;
1203         int dr1, dg1, db1, dr2, dg2, db2;
1204
1205         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1206             const uint8_t * const d64 = dither_8x8_73[y & 7];
1207             const uint8_t * const d32 = dither_8x8_32[y & 7];
1208             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1209             db1 =       d64[(i * 2 + 0) & 7];
1210             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1211             db2 =       d64[(i * 2 + 1) & 7];
1212         } else {
1213             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1214             const uint8_t * const d128 = dither_8x8_220[y & 7];
1215             dr1 = db1 = d128[(i * 2 + 0) & 7];
1216             dg1 =        d64[(i * 2 + 0) & 7];
1217             dr2 = db2 = d128[(i * 2 + 1) & 7];
1218             dg2 =        d64[(i * 2 + 1) & 7];
1219         }
1220
1221         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1222             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1223                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1224         } else {
1225             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1226             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1227         }
1228     }
1229 }
1230
1231 static av_always_inline void
1232 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1233                      const int16_t **lumSrc, int lumFilterSize,
1234                      const int16_t *chrFilter, const int16_t **chrUSrc,
1235                      const int16_t **chrVSrc, int chrFilterSize,
1236                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1237                      int y, enum PixelFormat target, int hasAlpha)
1238 {
1239     int i;
1240
1241     for (i = 0; i < (dstW >> 1); i++) {
1242         int j;
1243         int Y1 = 1 << 18;
1244         int Y2 = 1 << 18;
1245         int U  = 1 << 18;
1246         int V  = 1 << 18;
1247         int av_unused A1, A2;
1248         const void *r, *g, *b;
1249
1250         for (j = 0; j < lumFilterSize; j++) {
1251             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1252             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1253         }
1254         for (j = 0; j < chrFilterSize; j++) {
1255             U += chrUSrc[j][i] * chrFilter[j];
1256             V += chrVSrc[j][i] * chrFilter[j];
1257         }
1258         Y1 >>= 19;
1259         Y2 >>= 19;
1260         U  >>= 19;
1261         V  >>= 19;
1262         if ((Y1 | Y2 | U | V) & 0x100) {
1263             Y1 = av_clip_uint8(Y1);
1264             Y2 = av_clip_uint8(Y2);
1265             U  = av_clip_uint8(U);
1266             V  = av_clip_uint8(V);
1267         }
1268         if (hasAlpha) {
1269             A1 = 1 << 18;
1270             A2 = 1 << 18;
1271             for (j = 0; j < lumFilterSize; j++) {
1272                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1273                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1274             }
1275             A1 >>= 19;
1276             A2 >>= 19;
1277             if ((A1 | A2) & 0x100) {
1278                 A1 = av_clip_uint8(A1);
1279                 A2 = av_clip_uint8(A2);
1280             }
1281         }
1282
1283         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1284         r =  c->table_rV[V];
1285         g = (c->table_gU[U] + c->table_gV[V]);
1286         b =  c->table_bU[U];
1287
1288         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1289                       r, g, b, y, target, hasAlpha);
1290     }
1291 }
1292
1293 static av_always_inline void
1294 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1295                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1296                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1297                      int yalpha, int uvalpha, int y,
1298                      enum PixelFormat target, int hasAlpha)
1299 {
1300     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1301                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1302                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1303                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1304                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1305     int  yalpha1 = 4095 - yalpha;
1306     int uvalpha1 = 4095 - uvalpha;
1307     int i;
1308
1309     for (i = 0; i < (dstW >> 1); i++) {
1310         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1311         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1312         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1313         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1314         int A1, A2;
1315         const void *r =  c->table_rV[V],
1316                    *g = (c->table_gU[U] + c->table_gV[V]),
1317                    *b =  c->table_bU[U];
1318
1319         if (hasAlpha) {
1320             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1321             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1322         }
1323
1324         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1325                       r, g, b, y, target, hasAlpha);
1326     }
1327 }
1328
1329 static av_always_inline void
1330 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1331                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1332                      const int16_t *abuf0, uint8_t *dest, int dstW,
1333                      int uvalpha, int y, enum PixelFormat target,
1334                      int hasAlpha)
1335 {
1336     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1337                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1338     int i;
1339
1340     if (uvalpha < 2048) {
1341         for (i = 0; i < (dstW >> 1); i++) {
1342             int Y1 = buf0[i * 2]     >> 7;
1343             int Y2 = buf0[i * 2 + 1] >> 7;
1344             int U  = ubuf1[i]        >> 7;
1345             int V  = vbuf1[i]        >> 7;
1346             int A1, A2;
1347             const void *r =  c->table_rV[V],
1348                        *g = (c->table_gU[U] + c->table_gV[V]),
1349                        *b =  c->table_bU[U];
1350
1351             if (hasAlpha) {
1352                 A1 = abuf0[i * 2    ] >> 7;
1353                 A2 = abuf0[i * 2 + 1] >> 7;
1354             }
1355
1356             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1357                           r, g, b, y, target, hasAlpha);
1358         }
1359     } else {
1360         for (i = 0; i < (dstW >> 1); i++) {
1361             int Y1 =  buf0[i * 2]          >> 7;
1362             int Y2 =  buf0[i * 2 + 1]      >> 7;
1363             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1364             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1365             int A1, A2;
1366             const void *r =  c->table_rV[V],
1367                        *g = (c->table_gU[U] + c->table_gV[V]),
1368                        *b =  c->table_bU[U];
1369
1370             if (hasAlpha) {
1371                 A1 = abuf0[i * 2    ] >> 7;
1372                 A2 = abuf0[i * 2 + 1] >> 7;
1373             }
1374
1375             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1376                           r, g, b, y, target, hasAlpha);
1377         }
1378     }
1379 }
1380
1381 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1382 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1383                                 const int16_t **lumSrc, int lumFilterSize, \
1384                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1385                                 const int16_t **chrVSrc, int chrFilterSize, \
1386                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1387                                 int y) \
1388 { \
1389     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1390                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1391                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1392 }
1393 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1394 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1395 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1396                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1397                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1398                                 int yalpha, int uvalpha, int y) \
1399 { \
1400     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1401                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1402 } \
1403  \
1404 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1405                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1406                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1407                                 int uvalpha, int y) \
1408 { \
1409     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1410                                   dstW, uvalpha, y, fmt, hasAlpha); \
1411 }
1412
1413 #if CONFIG_SMALL
1414 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1415 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1416 #else
1417 #if CONFIG_SWSCALE_ALPHA
1418 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1419 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1420 #endif
1421 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1422 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1423 #endif
1424 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1425 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1426 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1427 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1428 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1429 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1430 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1431 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1432
1433 static av_always_inline void
1434 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1435                           const int16_t **lumSrc, int lumFilterSize,
1436                           const int16_t *chrFilter, const int16_t **chrUSrc,
1437                           const int16_t **chrVSrc, int chrFilterSize,
1438                           const int16_t **alpSrc, uint8_t *dest,
1439                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1440 {
1441     int i;
1442     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1443
1444     for (i = 0; i < dstW; i++) {
1445         int j;
1446         int Y = 1<<9;
1447         int U = (1<<9)-(128 << 19);
1448         int V = (1<<9)-(128 << 19);
1449         int av_unused A;
1450         int R, G, B;
1451
1452         for (j = 0; j < lumFilterSize; j++) {
1453             Y += lumSrc[j][i] * lumFilter[j];
1454         }
1455         for (j = 0; j < chrFilterSize; j++) {
1456             U += chrUSrc[j][i] * chrFilter[j];
1457             V += chrVSrc[j][i] * chrFilter[j];
1458         }
1459         Y >>= 10;
1460         U >>= 10;
1461         V >>= 10;
1462         if (hasAlpha) {
1463             A = 1 << 18;
1464             for (j = 0; j < lumFilterSize; j++) {
1465                 A += alpSrc[j][i] * lumFilter[j];
1466             }
1467             A >>= 19;
1468             if (A & 0x100)
1469                 A = av_clip_uint8(A);
1470         }
1471         Y -= c->yuv2rgb_y_offset;
1472         Y *= c->yuv2rgb_y_coeff;
1473         Y += 1 << 21;
1474         R = Y + V*c->yuv2rgb_v2r_coeff;
1475         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1476         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1477         if ((R | G | B) & 0xC0000000) {
1478             R = av_clip_uintp2(R, 30);
1479             G = av_clip_uintp2(G, 30);
1480             B = av_clip_uintp2(B, 30);
1481         }
1482
1483         switch(target) {
1484         case PIX_FMT_ARGB:
1485             dest[0] = hasAlpha ? A : 255;
1486             dest[1] = R >> 22;
1487             dest[2] = G >> 22;
1488             dest[3] = B >> 22;
1489             break;
1490         case PIX_FMT_RGB24:
1491             dest[0] = R >> 22;
1492             dest[1] = G >> 22;
1493             dest[2] = B >> 22;
1494             break;
1495         case PIX_FMT_RGBA:
1496             dest[0] = R >> 22;
1497             dest[1] = G >> 22;
1498             dest[2] = B >> 22;
1499             dest[3] = hasAlpha ? A : 255;
1500             break;
1501         case PIX_FMT_ABGR:
1502             dest[0] = hasAlpha ? A : 255;
1503             dest[1] = B >> 22;
1504             dest[2] = G >> 22;
1505             dest[3] = R >> 22;
1506             break;
1507         case PIX_FMT_BGR24:
1508             dest[0] = B >> 22;
1509             dest[1] = G >> 22;
1510             dest[2] = R >> 22;
1511             break;
1512         case PIX_FMT_BGRA:
1513             dest[0] = B >> 22;
1514             dest[1] = G >> 22;
1515             dest[2] = R >> 22;
1516             dest[3] = hasAlpha ? A : 255;
1517             break;
1518         }
1519         dest += step;
1520     }
1521 }
1522
1523 #if CONFIG_SMALL
1524 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1525 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1526 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1527 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1528 #else
1529 #if CONFIG_SWSCALE_ALPHA
1530 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1531 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1532 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1533 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1534 #endif
1535 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1536 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1537 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1538 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1539 #endif
1540 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1541 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1542
1543 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1544                                        int width, int height,
1545                                        int y, uint8_t val)
1546 {
1547     int i;
1548     uint8_t *ptr = plane + stride*y;
1549     for (i=0; i<height; i++) {
1550         memset(ptr, val, width);
1551         ptr += stride;
1552     }
1553 }
1554
1555 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1556
1557 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1558 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1559
1560 static av_always_inline void
1561 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1562                     enum PixelFormat origin)
1563 {
1564     int i;
1565     for (i = 0; i < width; i++) {
1566         unsigned int r_b = input_pixel(&src[i*3+0]);
1567         unsigned int   g = input_pixel(&src[i*3+1]);
1568         unsigned int b_r = input_pixel(&src[i*3+2]);
1569
1570         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1571     }
1572 }
1573
1574 static av_always_inline void
1575 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1576                     const uint16_t *src1, const uint16_t *src2,
1577                     int width, enum PixelFormat origin)
1578 {
1579     int i;
1580     assert(src1==src2);
1581     for (i = 0; i < width; i++) {
1582         int r_b = input_pixel(&src1[i*3+0]);
1583         int   g = input_pixel(&src1[i*3+1]);
1584         int b_r = input_pixel(&src1[i*3+2]);
1585
1586         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1587         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1588     }
1589 }
1590
1591 static av_always_inline void
1592 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1593                           const uint16_t *src1, const uint16_t *src2,
1594                           int width, enum PixelFormat origin)
1595 {
1596     int i;
1597     assert(src1==src2);
1598     for (i = 0; i < width; i++) {
1599         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1600         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1601         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1602
1603         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1604         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1605     }
1606 }
1607
1608 #undef r
1609 #undef b
1610 #undef input_pixel
1611
1612 #define rgb48funcs(pattern, BE_LE, origin) \
1613 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
1614                                     int width, uint32_t *unused) \
1615 { \
1616     const uint16_t *src = (const uint16_t *) _src; \
1617     uint16_t *dst = (uint16_t *) _dst; \
1618     rgb48ToY_c_template(dst, src, width, origin); \
1619 } \
1620  \
1621 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1622                                     const uint8_t *_src1, const uint8_t *_src2, \
1623                                     int width, uint32_t *unused) \
1624 { \
1625     const uint16_t *src1 = (const uint16_t *) _src1, \
1626                    *src2 = (const uint16_t *) _src2; \
1627     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1628     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1629 } \
1630  \
1631 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1632                                     const uint8_t *_src1, const uint8_t *_src2, \
1633                                     int width, uint32_t *unused) \
1634 { \
1635     const uint16_t *src1 = (const uint16_t *) _src1, \
1636                    *src2 = (const uint16_t *) _src2; \
1637     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1638     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1639 }
1640
1641 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1642 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1643 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1644 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1645
1646 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1647                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1648                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1649
1650 static av_always_inline void
1651 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1652                        int width, enum PixelFormat origin,
1653                        int shr,   int shg,   int shb, int shp,
1654                        int maskr, int maskg, int maskb,
1655                        int rsh,   int gsh,   int bsh, int S)
1656 {
1657     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1658               rnd = (32<<((S)-1)) + (1<<(S-7));
1659     int i;
1660
1661     for (i = 0; i < width; i++) {
1662         int px = input_pixel(i) >> shp;
1663         int b = (px & maskb) >> shb;
1664         int g = (px & maskg) >> shg;
1665         int r = (px & maskr) >> shr;
1666
1667         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1668     }
1669 }
1670
1671 static av_always_inline void
1672 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1673                         const uint8_t *src, int width,
1674                         enum PixelFormat origin,
1675                         int shr,   int shg,   int shb, int shp,
1676                         int maskr, int maskg, int maskb,
1677                         int rsh,   int gsh,   int bsh, int S)
1678 {
1679     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1680               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1681               rnd = (256<<((S)-1)) + (1<<(S-7));
1682     int i;
1683
1684     for (i = 0; i < width; i++) {
1685         int px = input_pixel(i) >> shp;
1686         int b = (px & maskb) >> shb;
1687         int g = (px & maskg) >> shg;
1688         int r = (px & maskr) >> shr;
1689
1690         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1691         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1692     }
1693 }
1694
1695 static av_always_inline void
1696 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1697                              const uint8_t *src, int width,
1698                              enum PixelFormat origin,
1699                              int shr,   int shg,   int shb, int shp,
1700                              int maskr, int maskg, int maskb,
1701                              int rsh,   int gsh,   int bsh, int S)
1702 {
1703     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1704               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1705               rnd = (256U<<(S)) + (1<<(S-6)), maskgx = ~(maskr | maskb);
1706     int i;
1707
1708     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1709     for (i = 0; i < width; i++) {
1710         int px0 = input_pixel(2 * i + 0) >> shp;
1711         int px1 = input_pixel(2 * i + 1) >> shp;
1712         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1713         int rb = px0 + px1 - g;
1714
1715         b = (rb & maskb) >> shb;
1716         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1717             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1718             g >>= shg;
1719         } else {
1720             g = (g  & maskg) >> shg;
1721         }
1722         r = (rb & maskr) >> shr;
1723
1724         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1725         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1726     }
1727 }
1728
1729 #undef input_pixel
1730
1731 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1732                          maskg, maskb, rsh, gsh, bsh, S) \
1733 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1734                           int width, uint32_t *unused) \
1735 { \
1736     rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, \
1737                            shr, shg, shb, shp, \
1738                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1739 } \
1740  \
1741 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1742                            const uint8_t *src, const uint8_t *dummy, \
1743                            int width, uint32_t *unused) \
1744 { \
1745     rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt,  \
1746                             shr, shg, shb, shp, \
1747                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1748 } \
1749  \
1750 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1751                                 const uint8_t *src, const uint8_t *dummy, \
1752                                 int width, uint32_t *unused) \
1753 { \
1754     rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
1755                                  shr, shg, shb, shp, \
1756                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1757 }
1758
1759 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1760 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1761 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1762 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1763 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1764 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1765 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1766 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1767 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1768 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1769 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1770 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1771
1772 static void abgrToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1773 {
1774     int i;
1775     for (i=0; i<width; i++) {
1776         dst[i]= src[4*i]<<6;
1777     }
1778 }
1779
1780 static void rgbaToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1781 {
1782     int i;
1783     for (i=0; i<width; i++) {
1784         dst[i]= src[4*i+3]<<6;
1785     }
1786 }
1787
1788 static void palToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *pal)
1789 {
1790     int i;
1791     for (i=0; i<width; i++) {
1792         int d= src[i];
1793
1794         dst[i]= (pal[d] >> 24)<<6;
1795     }
1796 }
1797
1798 static void palToY_c(int16_t *dst, const uint8_t *src, long width, uint32_t *pal)
1799 {
1800     int i;
1801     for (i=0; i<width; i++) {
1802         int d= src[i];
1803
1804         dst[i]= (pal[d] & 0xFF)<<6;
1805     }
1806 }
1807
1808 static void palToUV_c(uint16_t *dstU, int16_t *dstV,
1809                            const uint8_t *src1, const uint8_t *src2,
1810                            int width, uint32_t *pal)
1811 {
1812     int i;
1813     assert(src1 == src2);
1814     for (i=0; i<width; i++) {
1815         int p= pal[src1[i]];
1816
1817         dstU[i]= (uint8_t)(p>> 8)<<6;
1818         dstV[i]= (uint8_t)(p>>16)<<6;
1819     }
1820 }
1821
1822 static void monowhite2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1823 {
1824     int i, j;
1825     for (i=0; i<width/8; i++) {
1826         int d= ~src[i];
1827         for(j=0; j<8; j++)
1828             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1829     }
1830     if(width&7){
1831         int d= ~src[i];
1832         for(j=0; j<(width&7); j++)
1833             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1834     }
1835 }
1836
1837 static void monoblack2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
1838 {
1839     int i, j;
1840     for (i=0; i<width/8; i++) {
1841         int d= src[i];
1842         for(j=0; j<8; j++)
1843             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1844     }
1845     if(width&7){
1846         int d= src[i];
1847         for(j=0; j<(width&7); j++)
1848             dst[8*i+j]= ((d>>(7-j))&1)*16383;
1849     }
1850 }
1851
1852 //FIXME yuy2* can read up to 7 samples too much
1853
1854 static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
1855                       uint32_t *unused)
1856 {
1857     int i;
1858     for (i=0; i<width; i++)
1859         dst[i]= src[2*i];
1860 }
1861
1862 static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1863                        const uint8_t *src2, int width, uint32_t *unused)
1864 {
1865     int i;
1866     for (i=0; i<width; i++) {
1867         dstU[i]= src1[4*i + 1];
1868         dstV[i]= src1[4*i + 3];
1869     }
1870     assert(src1 == src2);
1871 }
1872
1873 static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
1874 {
1875     int i;
1876     const uint16_t *src = (const uint16_t *) _src;
1877     uint16_t *dst = (uint16_t *) _dst;
1878     for (i=0; i<width; i++) {
1879         dst[i] = av_bswap16(src[i]);
1880     }
1881 }
1882
1883 static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
1884                         const uint8_t *_src2, int width, uint32_t *unused)
1885 {
1886     int i;
1887     const uint16_t *src1 = (const uint16_t *) _src1,
1888                    *src2 = (const uint16_t *) _src2;
1889     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV;
1890     for (i=0; i<width; i++) {
1891         dstU[i] = av_bswap16(src1[i]);
1892         dstV[i] = av_bswap16(src2[i]);
1893     }
1894 }
1895
1896 /* This is almost identical to the previous, end exists only because
1897  * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1898 static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
1899                       uint32_t *unused)
1900 {
1901     int i;
1902     for (i=0; i<width; i++)
1903         dst[i]= src[2*i+1];
1904 }
1905
1906 static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1907                        const uint8_t *src2, int width, uint32_t *unused)
1908 {
1909     int i;
1910     for (i=0; i<width; i++) {
1911         dstU[i]= src1[4*i + 0];
1912         dstV[i]= src1[4*i + 2];
1913     }
1914     assert(src1 == src2);
1915 }
1916
1917 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1918                                         const uint8_t *src, int width)
1919 {
1920     int i;
1921     for (i = 0; i < width; i++) {
1922         dst1[i] = src[2*i+0];
1923         dst2[i] = src[2*i+1];
1924     }
1925 }
1926
1927 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1928                        const uint8_t *src1, const uint8_t *src2,
1929                        int width, uint32_t *unused)
1930 {
1931     nvXXtoUV_c(dstU, dstV, src1, width);
1932 }
1933
1934 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1935                        const uint8_t *src1, const uint8_t *src2,
1936                        int width, uint32_t *unused)
1937 {
1938     nvXXtoUV_c(dstV, dstU, src1, width);
1939 }
1940
1941 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1942
1943 static void bgr24ToY_c(int16_t *dst, const uint8_t *src,
1944                        int width, uint32_t *unused)
1945 {
1946     int i;
1947     for (i=0; i<width; i++) {
1948         int b= src[i*3+0];
1949         int g= src[i*3+1];
1950         int r= src[i*3+2];
1951
1952         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1953     }
1954 }
1955
1956 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1957                         const uint8_t *src2, int width, uint32_t *unused)
1958 {
1959     int i;
1960     for (i=0; i<width; i++) {
1961         int b= src1[3*i + 0];
1962         int g= src1[3*i + 1];
1963         int r= src1[3*i + 2];
1964
1965         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1966         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1967     }
1968     assert(src1 == src2);
1969 }
1970
1971 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1972                              const uint8_t *src2, int width, uint32_t *unused)
1973 {
1974     int i;
1975     for (i=0; i<width; i++) {
1976         int b= src1[6*i + 0] + src1[6*i + 3];
1977         int g= src1[6*i + 1] + src1[6*i + 4];
1978         int r= src1[6*i + 2] + src1[6*i + 5];
1979
1980         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1981         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1982     }
1983     assert(src1 == src2);
1984 }
1985
1986 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, int width,
1987                        uint32_t *unused)
1988 {
1989     int i;
1990     for (i=0; i<width; i++) {
1991         int r= src[i*3+0];
1992         int g= src[i*3+1];
1993         int b= src[i*3+2];
1994
1995         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1996     }
1997 }
1998
1999 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
2000                         const uint8_t *src2, int width, uint32_t *unused)
2001 {
2002     int i;
2003     assert(src1==src2);
2004     for (i=0; i<width; i++) {
2005         int r= src1[3*i + 0];
2006         int g= src1[3*i + 1];
2007         int b= src1[3*i + 2];
2008
2009         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
2010         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
2011     }
2012 }
2013
2014 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
2015                                     const uint8_t *src2, int width, uint32_t *unused)
2016 {
2017     int i;
2018     assert(src1==src2);
2019     for (i=0; i<width; i++) {
2020         int r= src1[6*i + 0] + src1[6*i + 3];
2021         int g= src1[6*i + 1] + src1[6*i + 4];
2022         int b= src1[6*i + 2] + src1[6*i + 5];
2023
2024         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
2025         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
2026     }
2027 }
2028
2029 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
2030                            const int16_t *filter,
2031                            const int16_t *filterPos, int filterSize)
2032 {
2033     int i;
2034     int32_t *dst = (int32_t *) _dst;
2035     const uint16_t *src = (const uint16_t *) _src;
2036     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2037     int sh = bits - 4;
2038
2039     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2040         sh= 9;
2041
2042     for (i = 0; i < dstW; i++) {
2043         int j;
2044         int srcPos = filterPos[i];
2045         int val = 0;
2046
2047         for (j = 0; j < filterSize; j++) {
2048             val += src[srcPos + j] * filter[filterSize * i + j];
2049         }
2050         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
2051         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
2052     }
2053 }
2054
2055 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
2056                            const int16_t *filter,
2057                            const int16_t *filterPos, int filterSize)
2058 {
2059     int i;
2060     const uint16_t *src = (const uint16_t *) _src;
2061     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2062
2063     if(sh<15)
2064         sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2065
2066     for (i = 0; i < dstW; i++) {
2067         int j;
2068         int srcPos = filterPos[i];
2069         int val = 0;
2070
2071         for (j = 0; j < filterSize; j++) {
2072             val += src[srcPos + j] * filter[filterSize * i + j];
2073         }
2074         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2075         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2076     }
2077 }
2078
2079 // bilinear / bicubic scaling
2080 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2081                           const int16_t *filter, const int16_t *filterPos,
2082                           int filterSize)
2083 {
2084     int i;
2085     for (i=0; i<dstW; i++) {
2086         int j;
2087         int srcPos= filterPos[i];
2088         int val=0;
2089         for (j=0; j<filterSize; j++) {
2090             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2091         }
2092         //filter += hFilterSize;
2093         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2094         //dst[i] = val>>7;
2095     }
2096 }
2097
2098 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
2099                           const int16_t *filter, const int16_t *filterPos,
2100                           int filterSize)
2101 {
2102     int i;
2103     int32_t *dst = (int32_t *) _dst;
2104     for (i=0; i<dstW; i++) {
2105         int j;
2106         int srcPos= filterPos[i];
2107         int val=0;
2108         for (j=0; j<filterSize; j++) {
2109             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2110         }
2111         //filter += hFilterSize;
2112         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
2113         //dst[i] = val>>7;
2114     }
2115 }
2116
2117 //FIXME all pal and rgb srcFormats could do this convertion as well
2118 //FIXME all scalers more complex than bilinear could do half of this transform
2119 static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
2120 {
2121     int i;
2122     for (i = 0; i < width; i++) {
2123         dstU[i] = (FFMIN(dstU[i],30775)*4663 - 9289992)>>12; //-264
2124         dstV[i] = (FFMIN(dstV[i],30775)*4663 - 9289992)>>12; //-264
2125     }
2126 }
2127 static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
2128 {
2129     int i;
2130     for (i = 0; i < width; i++) {
2131         dstU[i] = (dstU[i]*1799 + 4081085)>>11; //1469
2132         dstV[i] = (dstV[i]*1799 + 4081085)>>11; //1469
2133     }
2134 }
2135 static void lumRangeToJpeg_c(int16_t *dst, int width)
2136 {
2137     int i;
2138     for (i = 0; i < width; i++)
2139         dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2140 }
2141 static void lumRangeFromJpeg_c(int16_t *dst, int width)
2142 {
2143     int i;
2144     for (i = 0; i < width; i++)
2145         dst[i] = (dst[i]*14071 + 33561947)>>14;
2146 }
2147
2148 static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
2149 {
2150     int i;
2151     int32_t *dstU = (int32_t *) _dstU;
2152     int32_t *dstV = (int32_t *) _dstV;
2153     for (i = 0; i < width; i++) {
2154         dstU[i] = (FFMIN(dstU[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
2155         dstV[i] = (FFMIN(dstV[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
2156     }
2157 }
2158 static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
2159 {
2160     int i;
2161     int32_t *dstU = (int32_t *) _dstU;
2162     int32_t *dstV = (int32_t *) _dstV;
2163     for (i = 0; i < width; i++) {
2164         dstU[i] = (dstU[i]*1799 + (4081085<<4))>>11; //1469
2165         dstV[i] = (dstV[i]*1799 + (4081085<<4))>>11; //1469
2166     }
2167 }
2168 static void lumRangeToJpeg16_c(int16_t *_dst, int width)
2169 {
2170     int i;
2171     int32_t *dst = (int32_t *) _dst;
2172     for (i = 0; i < width; i++)
2173         dst[i] = (FFMIN(dst[i],30189<<4)*4769 - (39057361<<2))>>12;
2174 }
2175 static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
2176 {
2177     int i;
2178     int32_t *dst = (int32_t *) _dst;
2179     for (i = 0; i < width; i++)
2180         dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
2181 }
2182
2183 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2184                            const uint8_t *src, int srcW, int xInc)
2185 {
2186     int i;
2187     unsigned int xpos=0;
2188     for (i=0;i<dstWidth;i++) {
2189         register unsigned int xx=xpos>>16;
2190         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2191         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2192         xpos+=xInc;
2193     }
2194     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2195         dst[i] = src[srcW-1]*128;
2196 }
2197
2198 // *** horizontal scale Y line to temp buffer
2199 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2200                                      const uint8_t *src, int srcW, int xInc,
2201                                      const int16_t *hLumFilter,
2202                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2203                                      uint8_t *formatConvBuffer,
2204                                      uint32_t *pal, int isAlpha)
2205 {
2206     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2207     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2208
2209     if (toYV12) {
2210         toYV12(formatConvBuffer, src, srcW, pal);
2211         src= formatConvBuffer;
2212     }
2213
2214     if (c->hScale16) {
2215         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2216         c->hScale16(dst, dstWidth, (const uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, shift);
2217     } else if (!c->hyscale_fast) {
2218         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2219     } else { // fast bilinear upscale / crap downscale
2220         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2221     }
2222
2223     if (convertRange)
2224         convertRange(dst, dstWidth);
2225 }
2226
2227 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2228                            int dstWidth, const uint8_t *src1,
2229                            const uint8_t *src2, int srcW, int xInc)
2230 {
2231     int i;
2232     unsigned int xpos=0;
2233     for (i=0;i<dstWidth;i++) {
2234         register unsigned int xx=xpos>>16;
2235         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2236         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2237         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2238         xpos+=xInc;
2239     }
2240     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2241         dst1[i] = src1[srcW-1]*128;
2242         dst2[i] = src2[srcW-1]*128;
2243     }
2244 }
2245
2246 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2247                                      const uint8_t *src1, const uint8_t *src2,
2248                                      int srcW, int xInc, const int16_t *hChrFilter,
2249                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2250                                      uint8_t *formatConvBuffer, uint32_t *pal)
2251 {
2252     if (c->chrToYV12) {
2253         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2254         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2255         src1= formatConvBuffer;
2256         src2= buf2;
2257     }
2258
2259     if (c->hScale16) {
2260         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2261         c->hScale16(dst1, dstWidth, (const uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2262         c->hScale16(dst2, dstWidth, (const uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2263     } else if (!c->hcscale_fast) {
2264         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2265         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2266     } else { // fast bilinear upscale / crap downscale
2267         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2268     }
2269
2270     if (c->chrConvertRange)
2271         c->chrConvertRange(dst1, dst2, dstWidth);
2272 }
2273
2274 static av_always_inline void
2275 find_c_packed_planar_out_funcs(SwsContext *c,
2276                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
2277                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2278                                yuv2packedX_fn *yuv2packedX)
2279 {
2280     enum PixelFormat dstFormat = c->dstFormat;
2281
2282     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2283         *yuv2yuvX     = yuv2nv12X_c;
2284     } else if (is16BPS(dstFormat)) {
2285         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
2286     } else if (is9_OR_10BPS(dstFormat)) {
2287         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2288             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
2289         } else {
2290             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
2291         }
2292     } else {
2293         *yuv2yuv1     = yuv2yuv1_c;
2294         *yuv2yuvX     = yuv2yuvX_c;
2295     }
2296     if(c->flags & SWS_FULL_CHR_H_INT) {
2297         switch (dstFormat) {
2298             case PIX_FMT_RGBA:
2299 #if CONFIG_SMALL
2300                 *yuv2packedX = yuv2rgba32_full_X_c;
2301 #else
2302 #if CONFIG_SWSCALE_ALPHA
2303                 if (c->alpPixBuf) {
2304                     *yuv2packedX = yuv2rgba32_full_X_c;
2305                 } else
2306 #endif /* CONFIG_SWSCALE_ALPHA */
2307                 {
2308                     *yuv2packedX = yuv2rgbx32_full_X_c;
2309                 }
2310 #endif /* !CONFIG_SMALL */
2311                 break;
2312             case PIX_FMT_ARGB:
2313 #if CONFIG_SMALL
2314                 *yuv2packedX = yuv2argb32_full_X_c;
2315 #else
2316 #if CONFIG_SWSCALE_ALPHA
2317                 if (c->alpPixBuf) {
2318                     *yuv2packedX = yuv2argb32_full_X_c;
2319                 } else
2320 #endif /* CONFIG_SWSCALE_ALPHA */
2321                 {
2322                     *yuv2packedX = yuv2xrgb32_full_X_c;
2323                 }
2324 #endif /* !CONFIG_SMALL */
2325                 break;
2326             case PIX_FMT_BGRA:
2327 #if CONFIG_SMALL
2328                 *yuv2packedX = yuv2bgra32_full_X_c;
2329 #else
2330 #if CONFIG_SWSCALE_ALPHA
2331                 if (c->alpPixBuf) {
2332                     *yuv2packedX = yuv2bgra32_full_X_c;
2333                 } else
2334 #endif /* CONFIG_SWSCALE_ALPHA */
2335                 {
2336                     *yuv2packedX = yuv2bgrx32_full_X_c;
2337                 }
2338 #endif /* !CONFIG_SMALL */
2339                 break;
2340             case PIX_FMT_ABGR:
2341 #if CONFIG_SMALL
2342                 *yuv2packedX = yuv2abgr32_full_X_c;
2343 #else
2344 #if CONFIG_SWSCALE_ALPHA
2345                 if (c->alpPixBuf) {
2346                     *yuv2packedX = yuv2abgr32_full_X_c;
2347                 } else
2348 #endif /* CONFIG_SWSCALE_ALPHA */
2349                 {
2350                     *yuv2packedX = yuv2xbgr32_full_X_c;
2351                 }
2352 #endif /* !CONFIG_SMALL */
2353                 break;
2354             case PIX_FMT_RGB24:
2355             *yuv2packedX = yuv2rgb24_full_X_c;
2356             break;
2357         case PIX_FMT_BGR24:
2358             *yuv2packedX = yuv2bgr24_full_X_c;
2359             break;
2360         }
2361         if(!*yuv2packedX)
2362             goto YUV_PACKED;
2363     } else {
2364         YUV_PACKED:
2365         switch (dstFormat) {
2366         case PIX_FMT_GRAY16BE:
2367             *yuv2packed1 = yuv2gray16BE_1_c;
2368             *yuv2packed2 = yuv2gray16BE_2_c;
2369             *yuv2packedX = yuv2gray16BE_X_c;
2370             break;
2371         case PIX_FMT_GRAY16LE:
2372             *yuv2packed1 = yuv2gray16LE_1_c;
2373             *yuv2packed2 = yuv2gray16LE_2_c;
2374             *yuv2packedX = yuv2gray16LE_X_c;
2375             break;
2376         case PIX_FMT_MONOWHITE:
2377             *yuv2packed1 = yuv2monowhite_1_c;
2378             *yuv2packed2 = yuv2monowhite_2_c;
2379             *yuv2packedX = yuv2monowhite_X_c;
2380             break;
2381         case PIX_FMT_MONOBLACK:
2382             *yuv2packed1 = yuv2monoblack_1_c;
2383             *yuv2packed2 = yuv2monoblack_2_c;
2384             *yuv2packedX = yuv2monoblack_X_c;
2385             break;
2386         case PIX_FMT_YUYV422:
2387             *yuv2packed1 = yuv2yuyv422_1_c;
2388             *yuv2packed2 = yuv2yuyv422_2_c;
2389             *yuv2packedX = yuv2yuyv422_X_c;
2390             break;
2391         case PIX_FMT_UYVY422:
2392             *yuv2packed1 = yuv2uyvy422_1_c;
2393             *yuv2packed2 = yuv2uyvy422_2_c;
2394             *yuv2packedX = yuv2uyvy422_X_c;
2395             break;
2396         case PIX_FMT_RGB48LE:
2397             *yuv2packed1 = yuv2rgb48le_1_c;
2398             *yuv2packed2 = yuv2rgb48le_2_c;
2399             *yuv2packedX = yuv2rgb48le_X_c;
2400             break;
2401         case PIX_FMT_RGB48BE:
2402             *yuv2packed1 = yuv2rgb48be_1_c;
2403             *yuv2packed2 = yuv2rgb48be_2_c;
2404             *yuv2packedX = yuv2rgb48be_X_c;
2405             break;
2406         case PIX_FMT_BGR48LE:
2407             *yuv2packed1 = yuv2bgr48le_1_c;
2408             *yuv2packed2 = yuv2bgr48le_2_c;
2409             *yuv2packedX = yuv2bgr48le_X_c;
2410             break;
2411         case PIX_FMT_BGR48BE:
2412             *yuv2packed1 = yuv2bgr48be_1_c;
2413             *yuv2packed2 = yuv2bgr48be_2_c;
2414             *yuv2packedX = yuv2bgr48be_X_c;
2415             break;
2416         case PIX_FMT_RGB32:
2417         case PIX_FMT_BGR32:
2418 #if CONFIG_SMALL
2419             *yuv2packed1 = yuv2rgb32_1_c;
2420             *yuv2packed2 = yuv2rgb32_2_c;
2421             *yuv2packedX = yuv2rgb32_X_c;
2422 #else
2423 #if CONFIG_SWSCALE_ALPHA
2424                 if (c->alpPixBuf) {
2425                     *yuv2packed1 = yuv2rgba32_1_c;
2426                     *yuv2packed2 = yuv2rgba32_2_c;
2427                     *yuv2packedX = yuv2rgba32_X_c;
2428                 } else
2429 #endif /* CONFIG_SWSCALE_ALPHA */
2430                 {
2431                     *yuv2packed1 = yuv2rgbx32_1_c;
2432                     *yuv2packed2 = yuv2rgbx32_2_c;
2433                     *yuv2packedX = yuv2rgbx32_X_c;
2434                 }
2435 #endif /* !CONFIG_SMALL */
2436             break;
2437         case PIX_FMT_RGB32_1:
2438         case PIX_FMT_BGR32_1:
2439 #if CONFIG_SMALL
2440                 *yuv2packed1 = yuv2rgb32_1_1_c;
2441                 *yuv2packed2 = yuv2rgb32_1_2_c;
2442                 *yuv2packedX = yuv2rgb32_1_X_c;
2443 #else
2444 #if CONFIG_SWSCALE_ALPHA
2445                 if (c->alpPixBuf) {
2446                     *yuv2packed1 = yuv2rgba32_1_1_c;
2447                     *yuv2packed2 = yuv2rgba32_1_2_c;
2448                     *yuv2packedX = yuv2rgba32_1_X_c;
2449                 } else
2450 #endif /* CONFIG_SWSCALE_ALPHA */
2451                 {
2452                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2453                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2454                     *yuv2packedX = yuv2rgbx32_1_X_c;
2455                 }
2456 #endif /* !CONFIG_SMALL */
2457                 break;
2458         case PIX_FMT_RGB24:
2459             *yuv2packed1 = yuv2rgb24_1_c;
2460             *yuv2packed2 = yuv2rgb24_2_c;
2461             *yuv2packedX = yuv2rgb24_X_c;
2462             break;
2463         case PIX_FMT_BGR24:
2464             *yuv2packed1 = yuv2bgr24_1_c;
2465             *yuv2packed2 = yuv2bgr24_2_c;
2466             *yuv2packedX = yuv2bgr24_X_c;
2467             break;
2468         case PIX_FMT_RGB565LE:
2469         case PIX_FMT_RGB565BE:
2470         case PIX_FMT_BGR565LE:
2471         case PIX_FMT_BGR565BE:
2472             *yuv2packed1 = yuv2rgb16_1_c;
2473             *yuv2packed2 = yuv2rgb16_2_c;
2474             *yuv2packedX = yuv2rgb16_X_c;
2475             break;
2476         case PIX_FMT_RGB555LE:
2477         case PIX_FMT_RGB555BE:
2478         case PIX_FMT_BGR555LE:
2479         case PIX_FMT_BGR555BE:
2480             *yuv2packed1 = yuv2rgb15_1_c;
2481             *yuv2packed2 = yuv2rgb15_2_c;
2482             *yuv2packedX = yuv2rgb15_X_c;
2483             break;
2484         case PIX_FMT_RGB444LE:
2485         case PIX_FMT_RGB444BE:
2486         case PIX_FMT_BGR444LE:
2487         case PIX_FMT_BGR444BE:
2488             *yuv2packed1 = yuv2rgb12_1_c;
2489             *yuv2packed2 = yuv2rgb12_2_c;
2490             *yuv2packedX = yuv2rgb12_X_c;
2491             break;
2492         case PIX_FMT_RGB8:
2493         case PIX_FMT_BGR8:
2494             *yuv2packed1 = yuv2rgb8_1_c;
2495             *yuv2packed2 = yuv2rgb8_2_c;
2496             *yuv2packedX = yuv2rgb8_X_c;
2497             break;
2498         case PIX_FMT_RGB4:
2499         case PIX_FMT_BGR4:
2500             *yuv2packed1 = yuv2rgb4_1_c;
2501             *yuv2packed2 = yuv2rgb4_2_c;
2502             *yuv2packedX = yuv2rgb4_X_c;
2503             break;
2504         case PIX_FMT_RGB4_BYTE:
2505         case PIX_FMT_BGR4_BYTE:
2506             *yuv2packed1 = yuv2rgb4b_1_c;
2507             *yuv2packed2 = yuv2rgb4b_2_c;
2508             *yuv2packedX = yuv2rgb4b_X_c;
2509             break;
2510         }
2511     }
2512 }
2513
2514 #define DEBUG_SWSCALE_BUFFERS 0
2515 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2516
2517 static int swScale(SwsContext *c, const uint8_t* src[],
2518                    int srcStride[], int srcSliceY,
2519                    int srcSliceH, uint8_t* dst[], int dstStride[])
2520 {
2521     /* load a few things into local vars to make the code more readable? and faster */
2522     const int srcW= c->srcW;
2523     const int dstW= c->dstW;
2524     const int dstH= c->dstH;
2525     const int chrDstW= c->chrDstW;
2526     const int chrSrcW= c->chrSrcW;
2527     const int lumXInc= c->lumXInc;
2528     const int chrXInc= c->chrXInc;
2529     const enum PixelFormat dstFormat= c->dstFormat;
2530     const int flags= c->flags;
2531     int16_t *vLumFilterPos= c->vLumFilterPos;
2532     int16_t *vChrFilterPos= c->vChrFilterPos;
2533     int16_t *hLumFilterPos= c->hLumFilterPos;
2534     int16_t *hChrFilterPos= c->hChrFilterPos;
2535     int16_t *vLumFilter= c->vLumFilter;
2536     int16_t *vChrFilter= c->vChrFilter;
2537     int16_t *hLumFilter= c->hLumFilter;
2538     int16_t *hChrFilter= c->hChrFilter;
2539     int32_t *lumMmxFilter= c->lumMmxFilter;
2540     int32_t *chrMmxFilter= c->chrMmxFilter;
2541     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2542     const int vLumFilterSize= c->vLumFilterSize;
2543     const int vChrFilterSize= c->vChrFilterSize;
2544     const int hLumFilterSize= c->hLumFilterSize;
2545     const int hChrFilterSize= c->hChrFilterSize;
2546     int16_t **lumPixBuf= c->lumPixBuf;
2547     int16_t **chrUPixBuf= c->chrUPixBuf;
2548     int16_t **chrVPixBuf= c->chrVPixBuf;
2549     int16_t **alpPixBuf= c->alpPixBuf;
2550     const int vLumBufSize= c->vLumBufSize;
2551     const int vChrBufSize= c->vChrBufSize;
2552     uint8_t *formatConvBuffer= c->formatConvBuffer;
2553     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2554     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2555     int lastDstY;
2556     uint32_t *pal=c->pal_yuv;
2557
2558     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2559     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2560     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2561     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2562     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2563     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2564
2565     /* vars which will change and which we need to store back in the context */
2566     int dstY= c->dstY;
2567     int lumBufIndex= c->lumBufIndex;
2568     int chrBufIndex= c->chrBufIndex;
2569     int lastInLumBuf= c->lastInLumBuf;
2570     int lastInChrBuf= c->lastInChrBuf;
2571
2572     if (isPacked(c->srcFormat)) {
2573         src[0]=
2574         src[1]=
2575         src[2]=
2576         src[3]= src[0];
2577         srcStride[0]=
2578         srcStride[1]=
2579         srcStride[2]=
2580         srcStride[3]= srcStride[0];
2581     }
2582     srcStride[1]<<= c->vChrDrop;
2583     srcStride[2]<<= c->vChrDrop;
2584
2585     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2586                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2587                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2588     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2589                    srcSliceY,    srcSliceH,    dstY,    dstH);
2590     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2591                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2592
2593     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2594         static int warnedAlready=0; //FIXME move this into the context perhaps
2595         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2596             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2597                    "         ->cannot do aligned memory accesses anymore\n");
2598             warnedAlready=1;
2599         }
2600     }
2601
2602     /* Note the user might start scaling the picture in the middle so this
2603        will not get executed. This is not really intended but works
2604        currently, so people might do it. */
2605     if (srcSliceY ==0) {
2606         lumBufIndex=-1;
2607         chrBufIndex=-1;
2608         dstY=0;
2609         lastInLumBuf= -1;
2610         lastInChrBuf= -1;
2611     }
2612
2613     if (!should_dither) {
2614         c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
2615     }
2616     lastDstY= dstY;
2617
2618     for (;dstY < dstH; dstY++) {
2619         const int chrDstY= dstY>>c->chrDstVSubSample;
2620         uint8_t *dest[4] = {
2621             dst[0] + dstStride[0] * dstY,
2622             dst[1] + dstStride[1] * chrDstY,
2623             dst[2] + dstStride[2] * chrDstY,
2624             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2625         };
2626
2627         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2628         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2629         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2630         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2631         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2632         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2633         int enough_lines;
2634
2635         //handle holes (FAST_BILINEAR & weird filters)
2636         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2637         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2638         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2639         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2640
2641         DEBUG_BUFFERS("dstY: %d\n", dstY);
2642         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2643                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2644         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2645                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2646
2647         // Do we have enough lines in this slice to output the dstY line
2648         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2649
2650         if (!enough_lines) {
2651             lastLumSrcY = srcSliceY + srcSliceH - 1;
2652             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2653             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2654                                             lastLumSrcY, lastChrSrcY);
2655         }
2656
2657         //Do horizontal scaling
2658         while(lastInLumBuf < lastLumSrcY) {
2659             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2660             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2661             lumBufIndex++;
2662             assert(lumBufIndex < 2*vLumBufSize);
2663             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2664             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2665             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2666                     hLumFilter, hLumFilterPos, hLumFilterSize,
2667                     formatConvBuffer,
2668                     pal, 0);
2669             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2670                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2671                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2672                         formatConvBuffer,
2673                         pal, 1);
2674             lastInLumBuf++;
2675             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2676                                lumBufIndex,    lastInLumBuf);
2677         }
2678         while(lastInChrBuf < lastChrSrcY) {
2679             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2680             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2681             chrBufIndex++;
2682             assert(chrBufIndex < 2*vChrBufSize);
2683             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2684             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2685             //FIXME replace parameters through context struct (some at least)
2686
2687             if (c->needs_hcscale)
2688                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2689                           chrDstW, src1, src2, chrSrcW, chrXInc,
2690                           hChrFilter, hChrFilterPos, hChrFilterSize,
2691                           formatConvBuffer, pal);
2692             lastInChrBuf++;
2693             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2694                                chrBufIndex,    lastInChrBuf);
2695         }
2696         //wrap buf index around to stay inside the ring buffer
2697         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2698         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2699         if (!enough_lines)
2700             break; //we can't output a dstY line so let's try with the next slice
2701
2702 #if HAVE_MMX
2703         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2704 #endif
2705         if (should_dither) {
2706             c->chrDither8 = dither_8x8_128[chrDstY & 7];
2707             c->lumDither8 = dither_8x8_128[dstY & 7];
2708         }
2709         if (dstY >= dstH-2) {
2710             // hmm looks like we can't use MMX here without overwriting this array's tail
2711             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2712                                            &yuv2packed1, &yuv2packed2,
2713                                            &yuv2packedX);
2714         }
2715
2716         {
2717             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2718             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2719             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2720             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2721
2722             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2723                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2724                 if ((dstY&chrSkipMask) || isGray(dstFormat))
2725                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
2726                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2727                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2728                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
2729                              dest, dstW, chrDstW);
2730                 } else { //General YV12
2731                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
2732                              lumSrcPtr, vLumFilterSize,
2733                              vChrFilter + chrDstY * vChrFilterSize,
2734                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2735                              alpSrcPtr, dest, dstW, chrDstW);
2736                 }
2737             } else {
2738                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2739                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2740                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2741                     int chrAlpha = vChrFilter[2 * dstY + 1];
2742                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2743                                 alpPixBuf ? *alpSrcPtr : NULL,
2744                                 dest[0], dstW, chrAlpha, dstY);
2745                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2746                     int lumAlpha = vLumFilter[2 * dstY + 1];
2747                     int chrAlpha = vChrFilter[2 * dstY + 1];
2748                     lumMmxFilter[2] =
2749                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2750                     chrMmxFilter[2] =
2751                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2752                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2753                                 alpPixBuf ? alpSrcPtr : NULL,
2754                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2755                 } else { //general RGB
2756                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2757                                 lumSrcPtr, vLumFilterSize,
2758                                 vChrFilter + dstY * vChrFilterSize,
2759                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2760                                 alpSrcPtr, dest[0], dstW, dstY);
2761                 }
2762             }
2763         }
2764     }
2765
2766     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2767         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2768
2769 #if HAVE_MMX2
2770     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2771         __asm__ volatile("sfence":::"memory");
2772 #endif
2773     emms_c();
2774
2775     /* store changed local vars back in the context */
2776     c->dstY= dstY;
2777     c->lumBufIndex= lumBufIndex;
2778     c->chrBufIndex= chrBufIndex;
2779     c->lastInLumBuf= lastInLumBuf;
2780     c->lastInChrBuf= lastInChrBuf;
2781
2782     return dstY - lastDstY;
2783 }
2784
2785 static av_cold void sws_init_swScale_c(SwsContext *c)
2786 {
2787     enum PixelFormat srcFormat = c->srcFormat;
2788
2789     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2790                                    &c->yuv2packed1, &c->yuv2packed2,
2791                                    &c->yuv2packedX);
2792
2793     c->chrToYV12 = NULL;
2794     switch(srcFormat) {
2795         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2796         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2797         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2798         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2799         case PIX_FMT_RGB8     :
2800         case PIX_FMT_BGR8     :
2801         case PIX_FMT_PAL8     :
2802         case PIX_FMT_BGR4_BYTE:
2803         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2804 #if HAVE_BIGENDIAN
2805         case PIX_FMT_YUV444P9LE:
2806         case PIX_FMT_YUV420P9LE:
2807         case PIX_FMT_YUV422P10LE:
2808         case PIX_FMT_YUV420P10LE:
2809         case PIX_FMT_YUV444P10LE:
2810         case PIX_FMT_YUV420P16LE:
2811         case PIX_FMT_YUV422P16LE:
2812         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2813 #else
2814         case PIX_FMT_YUV444P9BE:
2815         case PIX_FMT_YUV420P9BE:
2816         case PIX_FMT_YUV444P10BE:
2817         case PIX_FMT_YUV422P10BE:
2818         case PIX_FMT_YUV420P10BE:
2819         case PIX_FMT_YUV420P16BE:
2820         case PIX_FMT_YUV422P16BE:
2821         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2822 #endif
2823     }
2824     if (c->chrSrcHSubSample) {
2825         switch(srcFormat) {
2826         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2827         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2828         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2829         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2830         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2831         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2832         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2833         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2834         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2835         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2836         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2837         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2838         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2839         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2840         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2841         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2842         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2843         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2844         }
2845     } else {
2846         switch(srcFormat) {
2847         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2848         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2849         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2850         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2851         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2852         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2853         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2854         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2855         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2856         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2857         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2858         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2859         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2860         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2861         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2862         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2863         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2864         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2865         }
2866     }
2867
2868     c->lumToYV12 = NULL;
2869     c->alpToYV12 = NULL;
2870     switch (srcFormat) {
2871 #if HAVE_BIGENDIAN
2872     case PIX_FMT_YUV444P9LE:
2873     case PIX_FMT_YUV420P9LE:
2874     case PIX_FMT_YUV422P10LE:
2875     case PIX_FMT_YUV420P10LE:
2876     case PIX_FMT_YUV444P10LE:
2877     case PIX_FMT_YUV420P16LE:
2878     case PIX_FMT_YUV422P16LE:
2879     case PIX_FMT_YUV444P16LE:
2880     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2881 #else
2882     case PIX_FMT_YUV444P9BE:
2883     case PIX_FMT_YUV420P9BE:
2884     case PIX_FMT_YUV444P10BE:
2885     case PIX_FMT_YUV422P10BE:
2886     case PIX_FMT_YUV420P10BE:
2887     case PIX_FMT_YUV420P16BE:
2888     case PIX_FMT_YUV422P16BE:
2889     case PIX_FMT_YUV444P16BE:
2890     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2891 #endif
2892     case PIX_FMT_YUYV422  :
2893     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2894     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2895     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2896     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2897     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2898     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2899     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2900     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2901     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2902     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2903     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2904     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2905     case PIX_FMT_RGB8     :
2906     case PIX_FMT_BGR8     :
2907     case PIX_FMT_PAL8     :
2908     case PIX_FMT_BGR4_BYTE:
2909     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2910     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2911     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2912     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2913     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2914     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2915     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2916     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2917     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2918     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2919     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2920     }
2921     if (c->alpPixBuf) {
2922         switch (srcFormat) {
2923         case PIX_FMT_BGRA:
2924         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2925         case PIX_FMT_ABGR:
2926         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2927         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2928         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
2929         }
2930     }
2931
2932
2933     if (c->srcBpc == 8) {
2934         if (c->dstBpc <= 10) {
2935             c->hyScale = c->hcScale = hScale8To15_c;
2936             if (c->flags & SWS_FAST_BILINEAR) {
2937                 c->hyscale_fast = hyscale_fast_c;
2938                 c->hcscale_fast = hcscale_fast_c;
2939             }
2940         } else {
2941             c->hyScale = c->hcScale = hScale8To19_c;
2942         }
2943     } else {
2944         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
2945     }
2946
2947     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2948         if (c->dstBpc <= 10) {
2949             if (c->srcRange) {
2950                 c->lumConvertRange = lumRangeFromJpeg_c;
2951                 c->chrConvertRange = chrRangeFromJpeg_c;
2952             } else {
2953                 c->lumConvertRange = lumRangeToJpeg_c;
2954                 c->chrConvertRange = chrRangeToJpeg_c;
2955             }
2956         } else {
2957             if (c->srcRange) {
2958                 c->lumConvertRange = lumRangeFromJpeg16_c;
2959                 c->chrConvertRange = chrRangeFromJpeg16_c;
2960             } else {
2961                 c->lumConvertRange = lumRangeToJpeg16_c;
2962                 c->chrConvertRange = chrRangeToJpeg16_c;
2963             }
2964         }
2965     }
2966
2967     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2968           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2969         c->needs_hcscale = 1;
2970 }
2971
2972 SwsFunc ff_getSwsFunc(SwsContext *c)
2973 {
2974     sws_init_swScale_c(c);
2975
2976     if (HAVE_MMX)
2977         ff_sws_init_swScale_mmx(c);
2978     if (HAVE_ALTIVEC)
2979         ff_sws_init_swScale_altivec(c);
2980
2981     return swScale;
2982 }