git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are actually tested, but I did not write them down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185
 186 static av_always_inline void
 187 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 188                       int lumFilterSize, const int16_t *chrFilter,
 189                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 190                       int chrFilterSize, const int32_t **alpSrc,
 191                       uint16_t *dest[4], int dstW, int chrDstW,
 192                       int big_endian, int output_bits)
 193 {
 194     //FIXME Optimize (just quickly written not optimized..)
 195     int i;
 196     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 197              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 198     int shift = 15 + 16 - output_bits - 1;
 199
 200 #define output_pixel(pos, val) \
 201     if (big_endian) { \
 202         if (output_bits == 16) { \
 203             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 204         } else { \
 205             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 206         } \
 207     } else { \
 208         if (output_bits == 16) { \
 209             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 210         } else { \
 211             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 212         } \
 213     }
 214     for (i = 0; i < dstW; i++) {
 215         int val = 1 << (30-output_bits - 1);
 216         int j;
 217
 218         for (j = 0; j < lumFilterSize; j++)
 219             val += (lumSrc[j][i] * lumFilter[j]) >> 1;
 220
 221         output_pixel(&yDest[i], val);
 222     }
 223
 224     if (uDest) {
 225         for (i = 0; i < chrDstW; i++) {
 226             int u = 1 << (30-output_bits - 1);
 227             int v = 1 << (30-output_bits - 1);
 228             int j;
 229
 230             for (j = 0; j < chrFilterSize; j++) {
 231                 u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
 232                 v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
 233             }
 234
 235             output_pixel(&uDest[i], u);
 236             output_pixel(&vDest[i], v);
 237         }
 238     }
 239
 240     if (CONFIG_SWSCALE_ALPHA && aDest) {
 241         for (i = 0; i < dstW; i++) {
 242             int val = 1 << (30-output_bits - 1);
 243             int j;
 244
 245             for (j = 0; j < lumFilterSize; j++)
 246                 val += (alpSrc[j][i] * lumFilter[j]) >> 1;
 247
 248             output_pixel(&aDest[i], val);
 249         }
 250     }
 251 #undef output_pixel
 252 }
 253
 254 #define yuv2NBPS(bits, BE_LE, is_be) \
 255 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 256                               const int16_t **_lumSrc, int lumFilterSize, \
 257                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 258                               const int16_t **_chrVSrc, \
 259                               int chrFilterSize, const int16_t **_alpSrc, \
 260                               uint8_t *_dest[4], int dstW, int chrDstW) \
 261 { \
 262     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 263                   **chrUSrc = (const int32_t **) _chrUSrc, \
 264                   **chrVSrc = (const int32_t **) _chrVSrc, \
 265                   **alpSrc  = (const int32_t **) _alpSrc; \
 266     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 267                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 268                           alpSrc, (uint16_t **) _dest, \
 269                           dstW, chrDstW, is_be, bits); \
 270 }
 271 yuv2NBPS( 9, BE, 1);
 272 yuv2NBPS( 9, LE, 0);
 273 yuv2NBPS(10, BE, 1);
 274 yuv2NBPS(10, LE, 0);
 275 yuv2NBPS(16, BE, 1);
 276 yuv2NBPS(16, LE, 0);
 277
 278 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 279                        const int16_t **lumSrc, int lumFilterSize,
 280                        const int16_t *chrFilter, const int16_t **chrUSrc,
 281                        const int16_t **chrVSrc,
 282                        int chrFilterSize, const int16_t **alpSrc,
 283                        uint8_t *dest[4], int dstW, int chrDstW)
 284 {
 285     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 286             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 287     int i;
 288
 289     //FIXME Optimize (just quickly written not optimized..)
 290     for (i=0; i<dstW; i++) {
 291         int val=1<<18;
 292         int j;
 293         for (j=0; j<lumFilterSize; j++)
 294             val += lumSrc[j][i] * lumFilter[j];
 295
 296         yDest[i]= av_clip_uint8(val>>19);
 297     }
 298
 299     if (uDest)
 300         for (i=0; i<chrDstW; i++) {
 301             int u=1<<18;
 302             int v=1<<18;
 303             int j;
 304             for (j=0; j<chrFilterSize; j++) {
 305                 u += chrUSrc[j][i] * chrFilter[j];
 306                 v += chrVSrc[j][i] * chrFilter[j];
 307             }
 308
 309             uDest[i]= av_clip_uint8(u>>19);
 310             vDest[i]= av_clip_uint8(v>>19);
 311         }
 312
 313     if (CONFIG_SWSCALE_ALPHA && aDest)
 314         for (i=0; i<dstW; i++) {
 315             int val=1<<18;
 316             int j;
 317             for (j=0; j<lumFilterSize; j++)
 318                 val += alpSrc[j][i] * lumFilter[j];
 319
 320             aDest[i]= av_clip_uint8(val>>19);
 321         }
 322 }
 323
 324 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 325                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 326                        const int16_t *alpSrc,
 327                        uint8_t *dest[4], int dstW, int chrDstW)
 328 {
 329     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 330             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 331     int i;
 332
 333     for (i=0; i<dstW; i++) {
 334         int val= (lumSrc[i]+64)>>7;
 335         yDest[i]= av_clip_uint8(val);
 336     }
 337
 338     if (uDest)
 339         for (i=0; i<chrDstW; i++) {
 340             int u=(chrUSrc[i]+64)>>7;
 341             int v=(chrVSrc[i]+64)>>7;
 342             uDest[i]= av_clip_uint8(u);
 343             vDest[i]= av_clip_uint8(v);
 344         }
 345
 346     if (CONFIG_SWSCALE_ALPHA && aDest)
 347         for (i=0; i<dstW; i++) {
 348             int val= (alpSrc[i]+64)>>7;
 349             aDest[i]= av_clip_uint8(val);
 350         }
 351 }
 352
 353 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 354                         const int16_t **lumSrc, int lumFilterSize,
 355                         const int16_t *chrFilter, const int16_t **chrUSrc,
 356                         const int16_t **chrVSrc, int chrFilterSize,
 357                         const int16_t **alpSrc, uint8_t *dest[4],
 358                         int dstW, int chrDstW)
 359 {
 360     uint8_t *yDest = dest[0], *uDest = dest[1];
 361     enum PixelFormat dstFormat = c->dstFormat;
 362
 363     //FIXME Optimize (just quickly written not optimized..)
 364     int i;
 365     for (i=0; i<dstW; i++) {
 366         int val=1<<18;
 367         int j;
 368         for (j=0; j<lumFilterSize; j++)
 369             val += lumSrc[j][i] * lumFilter[j];
 370
 371         yDest[i]= av_clip_uint8(val>>19);
 372     }
 373
 374     if (!uDest)
 375         return;
 376
 377     if (dstFormat == PIX_FMT_NV12)
 378         for (i=0; i<chrDstW; i++) {
 379             int u=1<<18;
 380             int v=1<<18;
 381             int j;
 382             for (j=0; j<chrFilterSize; j++) {
 383                 u += chrUSrc[j][i] * chrFilter[j];
 384                 v += chrVSrc[j][i] * chrFilter[j];
 385             }
 386
 387             uDest[2*i]= av_clip_uint8(u>>19);
 388             uDest[2*i+1]= av_clip_uint8(v>>19);
 389         }
 390     else
 391         for (i=0; i<chrDstW; i++) {
 392             int u=1<<18;
 393             int v=1<<18;
 394             int j;
 395             for (j=0; j<chrFilterSize; j++) {
 396                 u += chrUSrc[j][i] * chrFilter[j];
 397                 v += chrVSrc[j][i] * chrFilter[j];
 398             }
 399
 400             uDest[2*i]= av_clip_uint8(v>>19);
 401             uDest[2*i+1]= av_clip_uint8(u>>19);
 402         }
 403 }
 404
 405 #define output_pixel(pos, val) \
 406         if (target == PIX_FMT_GRAY16BE) { \
 407             AV_WB16(pos, val); \
 408         } else { \
 409             AV_WL16(pos, val); \
 410         }
 411
 412 static av_always_inline void
 413 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 414                         const int32_t **lumSrc, int lumFilterSize,
 415                         const int16_t *chrFilter, const int32_t **chrUSrc,
 416                         const int32_t **chrVSrc, int chrFilterSize,
 417                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 418                         int y, enum PixelFormat target)
 419 {
 420     int i;
 421
 422     for (i = 0; i < (dstW >> 1); i++) {
 423         int j;
 424         int Y1 = 1 << 14;
 425         int Y2 = 1 << 14;
 426
 427         for (j = 0; j < lumFilterSize; j++) {
 428             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 429             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 430         }
 431         Y1 >>= 15;
 432         Y2 >>= 15;
 433         if ((Y1 | Y2) & 0x10000) {
 434             Y1 = av_clip_uint16(Y1);
 435             Y2 = av_clip_uint16(Y2);
 436         }
 437         output_pixel(&dest[i * 2 + 0], Y1);
 438         output_pixel(&dest[i * 2 + 1], Y2);
 439     }
 440 }
 441
 442 static av_always_inline void
 443 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 444                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 445                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 446                         int yalpha, int uvalpha, int y,
 447                         enum PixelFormat target)
 448 {
 449     int  yalpha1 = 4095 - yalpha;
 450     int i;
 451     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 452
 453     for (i = 0; i < (dstW >> 1); i++) {
 454         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 455         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 456
 457         output_pixel(&dest[i * 2 + 0], Y1);
 458         output_pixel(&dest[i * 2 + 1], Y2);
 459     }
 460 }
 461
 462 static av_always_inline void
 463 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 464                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 465                         const int32_t *abuf0, uint16_t *dest, int dstW,
 466                         int uvalpha, int y, enum PixelFormat target)
 467 {
 468     int i;
 469
 470     for (i = 0; i < (dstW >> 1); i++) {
 471         int Y1 = buf0[i * 2    ] << 1;
 472         int Y2 = buf0[i * 2 + 1] << 1;
 473
 474         output_pixel(&dest[i * 2 + 0], Y1);
 475         output_pixel(&dest[i * 2 + 1], Y2);
 476     }
 477 }
 478
 479 #undef output_pixel
 480
 481 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 482 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 483                         const int16_t **_lumSrc, int lumFilterSize, \
 484                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 485                         const int16_t **_chrVSrc, int chrFilterSize, \
 486                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 487                         int y) \
 488 { \
 489     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 490                   **chrUSrc = (const int32_t **) _chrUSrc, \
 491                   **chrVSrc = (const int32_t **) _chrVSrc, \
 492                   **alpSrc  = (const int32_t **) _alpSrc; \
 493     uint16_t *dest = (uint16_t *) _dest; \
 494     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 495                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 496                           alpSrc, dest, dstW, y, fmt); \
 497 } \
 498  \
 499 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 500                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 501                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 502                         int yalpha, int uvalpha, int y) \
 503 { \
 504     const int32_t **buf  = (const int32_t **) _buf, \
 505                   **ubuf = (const int32_t **) _ubuf, \
 506                   **vbuf = (const int32_t **) _vbuf, \
 507                   **abuf = (const int32_t **) _abuf; \
 508     uint16_t *dest = (uint16_t *) _dest; \
 509     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 510                           dest, dstW, yalpha, uvalpha, y, fmt); \
 511 } \
 512  \
 513 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 514                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 515                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 516                         int uvalpha, int y) \
 517 { \
 518     const int32_t *buf0  = (const int32_t *)  _buf0, \
 519                  **ubuf  = (const int32_t **) _ubuf, \
 520                  **vbuf  = (const int32_t **) _vbuf, \
 521                   *abuf0 = (const int32_t *)  _abuf0; \
 522     uint16_t *dest = (uint16_t *) _dest; \
 523     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 524                                   dstW, uvalpha, y, fmt); \
 525 }
 526
 527 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 528 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 529
 530 #define output_pixel(pos, acc) \
 531     if (target == PIX_FMT_MONOBLACK) { \
 532         pos = acc; \
 533     } else { \
 534         pos = ~acc; \
 535     }
 536
 537 static av_always_inline void
 538 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 539                       const int16_t **lumSrc, int lumFilterSize,
 540                       const int16_t *chrFilter, const int16_t **chrUSrc,
 541                       const int16_t **chrVSrc, int chrFilterSize,
 542                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 543                       int y, enum PixelFormat target)
 544 {
 545     const uint8_t * const d128=dither_8x8_220[y&7];
 546     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 547     int i;
 548     int acc = 0;
 549
 550     for (i = 0; i < dstW - 1; i += 2) {
 551         int j;
 552         int Y1 = 1 << 18;
 553         int Y2 = 1 << 18;
 554
 555         for (j = 0; j < lumFilterSize; j++) {
 556             Y1 += lumSrc[j][i]   * lumFilter[j];
 557             Y2 += lumSrc[j][i+1] * lumFilter[j];
 558         }
 559         Y1 >>= 19;
 560         Y2 >>= 19;
 561         if ((Y1 | Y2) & 0x100) {
 562             Y1 = av_clip_uint8(Y1);
 563             Y2 = av_clip_uint8(Y2);
 564         }
 565         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 566         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 567         if ((i & 7) == 6) {
 568             output_pixel(*dest++, acc);
 569         }
 570     }
 571 }
 572
 573 static av_always_inline void
 574 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 575                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 576                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 577                       int yalpha, int uvalpha, int y,
 578                       enum PixelFormat target)
 579 {
 580     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 581     const uint8_t * const d128 = dither_8x8_220[y & 7];
 582     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 583     int  yalpha1 = 4095 - yalpha;
 584     int i;
 585
 586     for (i = 0; i < dstW - 7; i += 8) {
 587         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 588         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 589         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 590         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 591         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 592         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 593         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 594         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 595         output_pixel(*dest++, acc);
 596     }
 597 }
 598
 599 static av_always_inline void
 600 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 601                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 602                       const int16_t *abuf0, uint8_t *dest, int dstW,
 603                       int uvalpha, int y, enum PixelFormat target)
 604 {
 605     const uint8_t * const d128 = dither_8x8_220[y & 7];
 606     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 607     int i;
 608
 609     for (i = 0; i < dstW - 7; i += 8) {
 610         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 611         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 612         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 613         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 614         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 615         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 616         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 617         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 618         output_pixel(*dest++, acc);
 619     }
 620 }
 621
 622 #undef output_pixel
 623
 624 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 625 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 626                                 const int16_t **lumSrc, int lumFilterSize, \
 627                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 628                                 const int16_t **chrVSrc, int chrFilterSize, \
 629                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 630                                 int y) \
 631 { \
 632     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 633                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 634                                   alpSrc, dest, dstW, y, fmt); \
 635 } \
 636  \
 637 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 638                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 639                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 640                                 int yalpha, int uvalpha, int y) \
 641 { \
 642     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 643                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 644 } \
 645  \
 646 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 647                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 648                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 649                                 int uvalpha, int y) \
 650 { \
 651     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 652                                   abuf0, dest, dstW, uvalpha, \
 653                                   y, fmt); \
 654 }
 655
 656 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 657 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 658
 659 #define output_pixels(pos, Y1, U, Y2, V) \
 660     if (target == PIX_FMT_YUYV422) { \
 661         dest[pos + 0] = Y1; \
 662         dest[pos + 1] = U;  \
 663         dest[pos + 2] = Y2; \
 664         dest[pos + 3] = V;  \
 665     } else { \
 666         dest[pos + 0] = U;  \
 667         dest[pos + 1] = Y1; \
 668         dest[pos + 2] = V;  \
 669         dest[pos + 3] = Y2; \
 670     }
 671
 672 static av_always_inline void
 673 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 674                      const int16_t **lumSrc, int lumFilterSize,
 675                      const int16_t *chrFilter, const int16_t **chrUSrc,
 676                      const int16_t **chrVSrc, int chrFilterSize,
 677                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 678                      int y, enum PixelFormat target)
 679 {
 680     int i;
 681
 682     for (i = 0; i < (dstW >> 1); i++) {
 683         int j;
 684         int Y1 = 1 << 18;
 685         int Y2 = 1 << 18;
 686         int U  = 1 << 18;
 687         int V  = 1 << 18;
 688
 689         for (j = 0; j < lumFilterSize; j++) {
 690             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 691             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 692         }
 693         for (j = 0; j < chrFilterSize; j++) {
 694             U += chrUSrc[j][i] * chrFilter[j];
 695             V += chrVSrc[j][i] * chrFilter[j];
 696         }
 697         Y1 >>= 19;
 698         Y2 >>= 19;
 699         U  >>= 19;
 700         V  >>= 19;
 701         if ((Y1 | Y2 | U | V) & 0x100) {
 702             Y1 = av_clip_uint8(Y1);
 703             Y2 = av_clip_uint8(Y2);
 704             U  = av_clip_uint8(U);
 705             V  = av_clip_uint8(V);
 706         }
 707         output_pixels(4*i, Y1, U, Y2, V);
 708     }
 709 }
 710
 711 static av_always_inline void
 712 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 713                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 714                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 715                      int yalpha, int uvalpha, int y,
 716                      enum PixelFormat target)
 717 {
 718     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 719                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 720                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 721     int  yalpha1 = 4095 - yalpha;
 722     int uvalpha1 = 4095 - uvalpha;
 723     int i;
 724
 725     for (i = 0; i < (dstW >> 1); i++) {
 726         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 727         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 728         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 729         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 730
 731         output_pixels(i * 4, Y1, U, Y2, V);
 732     }
 733 }
 734
 735 static av_always_inline void
 736 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 737                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 738                      const int16_t *abuf0, uint8_t *dest, int dstW,
 739                      int uvalpha, int y, enum PixelFormat target)
 740 {
 741     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 742                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 743     int i;
 744
 745     if (uvalpha < 2048) {
 746         for (i = 0; i < (dstW >> 1); i++) {
 747             int Y1 = buf0[i * 2]     >> 7;
 748             int Y2 = buf0[i * 2 + 1] >> 7;
 749             int U  = ubuf1[i]        >> 7;
 750             int V  = vbuf1[i]        >> 7;
 751
 752             output_pixels(i * 4, Y1, U, Y2, V);
 753         }
 754     } else {
 755         for (i = 0; i < (dstW >> 1); i++) {
 756             int Y1 =  buf0[i * 2]          >> 7;
 757             int Y2 =  buf0[i * 2 + 1]      >> 7;
 758             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 759             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 760
 761             output_pixels(i * 4, Y1, U, Y2, V);
 762         }
 763     }
 764 }
 765
 766 #undef output_pixels
 767
 768 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
 769 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 770
 771 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 772 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 773 #define output_pixel(pos, val) \
 774     if (isBE(target)) { \
 775         AV_WB16(pos, val); \
 776     } else { \
 777         AV_WL16(pos, val); \
 778     }
 779
 780 static av_always_inline void
 781 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 782                        const int32_t **lumSrc, int lumFilterSize,
 783                        const int16_t *chrFilter, const int32_t **chrUSrc,
 784                        const int32_t **chrVSrc, int chrFilterSize,
 785                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 786                        int y, enum PixelFormat target)
 787 {
 788     int i;
 789
 790     for (i = 0; i < (dstW >> 1); i++) {
 791         int j;
 792         int Y1 = 0;
 793         int Y2 = 0;
 794         int U  = -128 << 23; // 19
 795         int V  = -128 << 23;
 796         int R, G, B;
 797
 798         for (j = 0; j < lumFilterSize; j++) {
 799             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 800             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 801         }
 802         for (j = 0; j < chrFilterSize; j++) {
 803             U += chrUSrc[j][i] * chrFilter[j];
 804             V += chrVSrc[j][i] * chrFilter[j];
 805         }
 806
 807         // 8bit: 12+15=27; 16-bit: 12+19=31
 808         Y1 >>= 14; // 10
 809         Y2 >>= 14;
 810         U  >>= 14;
 811         V  >>= 14;
 812
 813         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 814         Y1 -= c->yuv2rgb_y_offset;
 815         Y2 -= c->yuv2rgb_y_offset;
 816         Y1 *= c->yuv2rgb_y_coeff;
 817         Y2 *= c->yuv2rgb_y_coeff;
 818         Y1 += 1 << 13; // 21
 819         Y2 += 1 << 13;
 820         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 821
 822         R = V * c->yuv2rgb_v2r_coeff;
 823         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 824         B =                            U * c->yuv2rgb_u2b_coeff;
 825
 826         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 827         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 828         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 829         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 830         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 831         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 832         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 833         dest += 6;
 834     }
 835 }
 836
 837 static av_always_inline void
 838 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 839                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 840                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 841                        int yalpha, int uvalpha, int y,
 842                        enum PixelFormat target)
 843 {
 844     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 845                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 846                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 847     int  yalpha1 = 4095 - yalpha;
 848     int uvalpha1 = 4095 - uvalpha;
 849     int i;
 850
 851     for (i = 0; i < (dstW >> 1); i++) {
 852         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 853         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 854         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 855         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 856         int R, G, B;
 857
 858         Y1 -= c->yuv2rgb_y_offset;
 859         Y2 -= c->yuv2rgb_y_offset;
 860         Y1 *= c->yuv2rgb_y_coeff;
 861         Y2 *= c->yuv2rgb_y_coeff;
 862         Y1 += 1 << 13;
 863         Y2 += 1 << 13;
 864
 865         R = V * c->yuv2rgb_v2r_coeff;
 866         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 867         B =                            U * c->yuv2rgb_u2b_coeff;
 868
 869         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 870         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 871         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 872         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 873         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 874         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 875         dest += 6;
 876     }
 877 }
 878
 879 static av_always_inline void
 880 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 881                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 882                        const int32_t *abuf0, uint16_t *dest, int dstW,
 883                        int uvalpha, int y, enum PixelFormat target)
 884 {
 885     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 886                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 887     int i;
 888
 889     if (uvalpha < 2048) {
 890         for (i = 0; i < (dstW >> 1); i++) {
 891             int Y1 = (buf0[i * 2]    ) >> 2;
 892             int Y2 = (buf0[i * 2 + 1]) >> 2;
 893             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 894             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 895             int R, G, B;
 896
 897             Y1 -= c->yuv2rgb_y_offset;
 898             Y2 -= c->yuv2rgb_y_offset;
 899             Y1 *= c->yuv2rgb_y_coeff;
 900             Y2 *= c->yuv2rgb_y_coeff;
 901             Y1 += 1 << 13;
 902             Y2 += 1 << 13;
 903
 904             R = V * c->yuv2rgb_v2r_coeff;
 905             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 906             B =                            U * c->yuv2rgb_u2b_coeff;
 907
 908             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 909             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 910             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 911             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 912             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 913             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 914             dest += 6;
 915         }
 916     } else {
 917         for (i = 0; i < (dstW >> 1); i++) {
 918             int Y1 = (buf0[i * 2]    ) >> 2;
 919             int Y2 = (buf0[i * 2 + 1]) >> 2;
 920             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 11)) >> 3;
 921             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 11)) >> 3;
 922             int R, G, B;
 923
 924             Y1 -= c->yuv2rgb_y_offset;
 925             Y2 -= c->yuv2rgb_y_offset;
 926             Y1 *= c->yuv2rgb_y_coeff;
 927             Y2 *= c->yuv2rgb_y_coeff;
 928             Y1 += 1 << 13;
 929             Y2 += 1 << 13;
 930
 931             R = V * c->yuv2rgb_v2r_coeff;
 932             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 933             B =                            U * c->yuv2rgb_u2b_coeff;
 934
 935             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 936             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 937             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 938             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 939             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 940             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 941             dest += 6;
 942         }
 943     }
 944 }
 945
 946 #undef output_pixel
 947 #undef r_b
 948 #undef b_r
 949
 950 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
 951 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
 952 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
 953 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
 954
 955 static av_always_inline void
 956 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
 957               int U, int V, int A1, int A2,
 958               const void *_r, const void *_g, const void *_b, int y,
 959               enum PixelFormat target, int hasAlpha)
 960 {
 961     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 962         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 963         uint32_t *dest = (uint32_t *) _dest;
 964         const uint32_t *r = (const uint32_t *) _r;
 965         const uint32_t *g = (const uint32_t *) _g;
 966         const uint32_t *b = (const uint32_t *) _b;
 967
 968 #if CONFIG_SMALL
 969         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 970
 971         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
 972         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
 973 #else
 974         if (hasAlpha) {
 975             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
 976
 977             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
 978             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
 979         } else {
 980             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
 981             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 982         }
 983 #endif
 984     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
 985         uint8_t *dest = (uint8_t *) _dest;
 986         const uint8_t *r = (const uint8_t *) _r;
 987         const uint8_t *g = (const uint8_t *) _g;
 988         const uint8_t *b = (const uint8_t *) _b;
 989
 990 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
 991 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
 992         dest[i * 6 + 0] = r_b[Y1];
 993         dest[i * 6 + 1] =   g[Y1];
 994         dest[i * 6 + 2] = b_r[Y1];
 995         dest[i * 6 + 3] = r_b[Y2];
 996         dest[i * 6 + 4] =   g[Y2];
 997         dest[i * 6 + 5] = b_r[Y2];
 998 #undef r_b
 999 #undef b_r
1000     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1001                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1002                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1003         uint16_t *dest = (uint16_t *) _dest;
1004         const uint16_t *r = (const uint16_t *) _r;
1005         const uint16_t *g = (const uint16_t *) _g;
1006         const uint16_t *b = (const uint16_t *) _b;
1007         int dr1, dg1, db1, dr2, dg2, db2;
1008
1009         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1010             dr1 = dither_2x2_8[ y & 1     ][0];
1011             dg1 = dither_2x2_4[ y & 1     ][0];
1012             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1013             dr2 = dither_2x2_8[ y & 1     ][1];
1014             dg2 = dither_2x2_4[ y & 1     ][1];
1015             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1016         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1017             dr1 = dither_2x2_8[ y & 1     ][0];
1018             dg1 = dither_2x2_8[ y & 1     ][1];
1019             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1020             dr2 = dither_2x2_8[ y & 1     ][1];
1021             dg2 = dither_2x2_8[ y & 1     ][0];
1022             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1023         } else {
1024             dr1 = dither_4x4_16[ y & 3     ][0];
1025             dg1 = dither_4x4_16[ y & 3     ][1];
1026             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1027             dr2 = dither_4x4_16[ y & 3     ][1];
1028             dg2 = dither_4x4_16[ y & 3     ][0];
1029             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1030         }
1031
1032         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1033         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1034     } else /* 8/4-bit */ {
1035         uint8_t *dest = (uint8_t *) _dest;
1036         const uint8_t *r = (const uint8_t *) _r;
1037         const uint8_t *g = (const uint8_t *) _g;
1038         const uint8_t *b = (const uint8_t *) _b;
1039         int dr1, dg1, db1, dr2, dg2, db2;
1040
1041         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1042             const uint8_t * const d64 = dither_8x8_73[y & 7];
1043             const uint8_t * const d32 = dither_8x8_32[y & 7];
1044             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1045             db1 =       d64[(i * 2 + 0) & 7];
1046             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1047             db2 =       d64[(i * 2 + 1) & 7];
1048         } else {
1049             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1050             const uint8_t * const d128 = dither_8x8_220[y & 7];
1051             dr1 = db1 = d128[(i * 2 + 0) & 7];
1052             dg1 =        d64[(i * 2 + 0) & 7];
1053             dr2 = db2 = d128[(i * 2 + 1) & 7];
1054             dg2 =        d64[(i * 2 + 1) & 7];
1055         }
1056
1057         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1058             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1059                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1060         } else {
1061             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1062             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1063         }
1064     }
1065 }
1066
1067 static av_always_inline void
1068 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1069                      const int16_t **lumSrc, int lumFilterSize,
1070                      const int16_t *chrFilter, const int16_t **chrUSrc,
1071                      const int16_t **chrVSrc, int chrFilterSize,
1072                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1073                      int y, enum PixelFormat target, int hasAlpha)
1074 {
1075     int i;
1076
1077     for (i = 0; i < (dstW >> 1); i++) {
1078         int j;
1079         int Y1 = 1 << 18;
1080         int Y2 = 1 << 18;
1081         int U  = 1 << 18;
1082         int V  = 1 << 18;
1083         int av_unused A1, A2;
1084         const void *r, *g, *b;
1085
1086         for (j = 0; j < lumFilterSize; j++) {
1087             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1088             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1089         }
1090         for (j = 0; j < chrFilterSize; j++) {
1091             U += chrUSrc[j][i] * chrFilter[j];
1092             V += chrVSrc[j][i] * chrFilter[j];
1093         }
1094         Y1 >>= 19;
1095         Y2 >>= 19;
1096         U  >>= 19;
1097         V  >>= 19;
1098         if ((Y1 | Y2 | U | V) & 0x100) {
1099             Y1 = av_clip_uint8(Y1);
1100             Y2 = av_clip_uint8(Y2);
1101             U  = av_clip_uint8(U);
1102             V  = av_clip_uint8(V);
1103         }
1104         if (hasAlpha) {
1105             A1 = 1 << 18;
1106             A2 = 1 << 18;
1107             for (j = 0; j < lumFilterSize; j++) {
1108                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1109                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1110             }
1111             A1 >>= 19;
1112             A2 >>= 19;
1113             if ((A1 | A2) & 0x100) {
1114                 A1 = av_clip_uint8(A1);
1115                 A2 = av_clip_uint8(A2);
1116             }
1117         }
1118
1119         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1120         r =  c->table_rV[V];
1121         g = (c->table_gU[U] + c->table_gV[V]);
1122         b =  c->table_bU[U];
1123
1124         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1125                       r, g, b, y, target, hasAlpha);
1126     }
1127 }
1128
1129 static av_always_inline void
1130 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1131                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1132                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1133                      int yalpha, int uvalpha, int y,
1134                      enum PixelFormat target, int hasAlpha)
1135 {
1136     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1137                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1138                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1139                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1140                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1141     int  yalpha1 = 4095 - yalpha;
1142     int uvalpha1 = 4095 - uvalpha;
1143     int i;
1144
1145     for (i = 0; i < (dstW >> 1); i++) {
1146         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1147         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1148         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1149         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1150         int A1, A2;
1151         const void *r =  c->table_rV[V],
1152                    *g = (c->table_gU[U] + c->table_gV[V]),
1153                    *b =  c->table_bU[U];
1154
1155         if (hasAlpha) {
1156             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1157             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1158         }
1159
1160         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1161                       r, g, b, y, target, hasAlpha);
1162     }
1163 }
1164
1165 static av_always_inline void
1166 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1167                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1168                      const int16_t *abuf0, uint8_t *dest, int dstW,
1169                      int uvalpha, int y, enum PixelFormat target,
1170                      int hasAlpha)
1171 {
1172     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1173                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1174     int i;
1175
1176     if (uvalpha < 2048) {
1177         for (i = 0; i < (dstW >> 1); i++) {
1178             int Y1 = buf0[i * 2]     >> 7;
1179             int Y2 = buf0[i * 2 + 1] >> 7;
1180             int U  = ubuf1[i]        >> 7;
1181             int V  = vbuf1[i]        >> 7;
1182             int A1, A2;
1183             const void *r =  c->table_rV[V],
1184                        *g = (c->table_gU[U] + c->table_gV[V]),
1185                        *b =  c->table_bU[U];
1186
1187             if (hasAlpha) {
1188                 A1 = abuf0[i * 2    ] >> 7;
1189                 A2 = abuf0[i * 2 + 1] >> 7;
1190             }
1191
1192             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1193                           r, g, b, y, target, hasAlpha);
1194         }
1195     } else {
1196         for (i = 0; i < (dstW >> 1); i++) {
1197             int Y1 =  buf0[i * 2]          >> 7;
1198             int Y2 =  buf0[i * 2 + 1]      >> 7;
1199             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1200             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1201             int A1, A2;
1202             const void *r =  c->table_rV[V],
1203                        *g = (c->table_gU[U] + c->table_gV[V]),
1204                        *b =  c->table_bU[U];
1205
1206             if (hasAlpha) {
1207                 A1 = abuf0[i * 2    ] >> 7;
1208                 A2 = abuf0[i * 2 + 1] >> 7;
1209             }
1210
1211             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1212                           r, g, b, y, target, hasAlpha);
1213         }
1214     }
1215 }
1216
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) defines the function
 * name##ext##_X_c, which forwards all of its arguments to
 * name##base##_X_c_template and appends fmt and hasAlpha, so that the
 * always-inline template is specialised for one pixel format.
 */
1217 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1218 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1219                                 const int16_t **lumSrc, int lumFilterSize, \
1220                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1221                                 const int16_t **chrVSrc, int chrFilterSize, \
1222                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1223                                 int y) \
1224 { \
1225     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1226                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1227                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1228 }
/*
 * YUV2RGBWRAPPER additionally defines name##ext##_2_c and
 * name##ext##_1_c, forwarding to the matching _2_c_template and
 * _1_c_template in the same way.
 */
1229 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1230 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1231 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1232                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1233                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1234                                 int yalpha, int uvalpha, int y) \
1235 { \
1236     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1237                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1238 } \
1239  \
1240 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1241                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1242                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1243                                 int uvalpha, int y) \
1244 { \
1245     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1246                                   dstW, uvalpha, y, fmt, hasAlpha); \
1247 }
1248
/*
 * Instantiations. With CONFIG_SMALL a single 32-bit variant decides about
 * alpha at run time (hasAlpha is the expression
 * CONFIG_SWSCALE_ALPHA && c->alpPixBuf, evaluated inside the wrapper);
 * otherwise separate "a32" (hasAlpha = 1, only with CONFIG_SWSCALE_ALPHA)
 * and "x32" (hasAlpha = 0) variants are generated.
 */
1249 #if CONFIG_SMALL
1250 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1251 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1252 #else
1253 #if CONFIG_SWSCALE_ALPHA
1254 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1255 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1256 #endif
1257 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1258 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1259 #endif
/* Formats without an alpha channel always pass hasAlpha = 0. */
1260 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1261 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1262 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1263 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1264 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1265 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1266 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1267 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1268
1269 static av_always_inline void
1270 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1271                           const int16_t **lumSrc, int lumFilterSize,
1272                           const int16_t *chrFilter, const int16_t **chrUSrc,
1273                           const int16_t **chrVSrc, int chrFilterSize,
1274                           const int16_t **alpSrc, uint8_t *dest,
1275                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1276 {
1277     int i;
1278     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1279
1280     for (i = 0; i < dstW; i++) {
1281         int j;
1282         int Y = 0;
1283         int U = -128 << 19;
1284         int V = -128 << 19;
1285         int av_unused A;
1286         int R, G, B;
1287
1288         for (j = 0; j < lumFilterSize; j++) {
1289             Y += lumSrc[j][i] * lumFilter[j];
1290         }
1291         for (j = 0; j < chrFilterSize; j++) {
1292             U += chrUSrc[j][i] * chrFilter[j];
1293             V += chrVSrc[j][i] * chrFilter[j];
1294         }
1295         Y >>= 10;
1296         U >>= 10;
1297         V >>= 10;
1298         if (hasAlpha) {
1299             A = 1 << 21;
1300             for (j = 0; j < lumFilterSize; j++) {
1301                 A += alpSrc[j][i] * lumFilter[j];
1302             }
1303             A >>= 19;
1304             if (A & 0x100)
1305                 A = av_clip_uint8(A);
1306         }
1307         Y -= c->yuv2rgb_y_offset;
1308         Y *= c->yuv2rgb_y_coeff;
1309         Y += 1 << 21;
1310         R = Y + V*c->yuv2rgb_v2r_coeff;
1311         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1312         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1313         if ((R | G | B) & 0xC0000000) {
1314             R = av_clip_uintp2(R, 30);
1315             G = av_clip_uintp2(G, 30);
1316             B = av_clip_uintp2(B, 30);
1317         }
1318
1319         switch(target) {
1320         case PIX_FMT_ARGB:
1321             dest[0] = hasAlpha ? A : 255;
1322             dest[1] = R >> 22;
1323             dest[2] = G >> 22;
1324             dest[3] = B >> 22;
1325             break;
1326         case PIX_FMT_RGB24:
1327             dest[0] = R >> 22;
1328             dest[1] = G >> 22;
1329             dest[2] = B >> 22;
1330             break;
1331         case PIX_FMT_RGBA:
1332             dest[0] = R >> 22;
1333             dest[1] = G >> 22;
1334             dest[2] = B >> 22;
1335             dest[3] = hasAlpha ? A : 255;
1336             break;
1337         case PIX_FMT_ABGR:
1338             dest[0] = hasAlpha ? A : 255;
1339             dest[1] = B >> 22;
1340             dest[2] = G >> 22;
1341             dest[3] = R >> 22;
1342             dest += 4;
1343             break;
1344         case PIX_FMT_BGR24:
1345             dest[0] = B >> 22;
1346             dest[1] = G >> 22;
1347             dest[2] = R >> 22;
1348             break;
1349         case PIX_FMT_BGRA:
1350             dest[0] = B >> 22;
1351             dest[1] = G >> 22;
1352             dest[2] = R >> 22;
1353             dest[3] = hasAlpha ? A : 255;
1354             break;
1355         }
1356         dest += step;
1357     }
1358 }
1359
/*
 * Instantiate the full-chroma writers (only the _X_c variant exists for
 * them). With CONFIG_SMALL one function per 32-bit format checks for
 * alpha at run time; otherwise alpha-writing variants (hasAlpha = 1,
 * only with CONFIG_SWSCALE_ALPHA) and "x" variants (hasAlpha = 0, which
 * makes yuv2rgb_full_X_c_template store 255 as alpha) are generated
 * separately for the same pixel formats.
 */
1360 #if CONFIG_SMALL
1361 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1362 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1363 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1364 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1365 #else
1366 #if CONFIG_SWSCALE_ALPHA
1367 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1368 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1369 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1370 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1371 #endif
1372 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1373 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1374 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1375 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1376 #endif
/* 24-bit formats carry no alpha. */
1377 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1378 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1379
/**
 * Set a rectangular region of a plane to a constant byte value.
 *
 * Starting at line y, the first width bytes of height consecutive lines
 * (stride bytes apart) are set to val.
 */
static av_always_inline void fillPlane(uint8_t* plane, int stride,
                                       int width, int height,
                                       int y, uint8_t val)
{
    uint8_t *row = plane + stride*y;
    int remaining;

    for (remaining = height; remaining > 0; remaining--) {
        memset(row, val, width);
        row += stride;
    }
}
1391
/* Read one 16-bit component at pos in the byte order of "origin". */
1392 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1393
/*
 * Inside the 48-bit input templates below, r and b are macros, not
 * variables: they pick whichever of the locals r_b (first component) and
 * b_r (third component) holds red resp. blue for "origin". The templates
 * therefore must keep those exact local names.
 */
1394 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1395 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1396
1397 static av_always_inline void
1398 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1399                     enum PixelFormat origin)
1400 {
1401     int i;
1402     for (i = 0; i < width; i++) {
1403         unsigned int r_b = input_pixel(&src[i*3+0]);
1404         unsigned int   g = input_pixel(&src[i*3+1]);
1405         unsigned int b_r = input_pixel(&src[i*3+2]);
1406
1407         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1408     }
1409 }
1410
1411 static av_always_inline void
1412 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1413                     const uint16_t *src1, const uint16_t *src2,
1414                     int width, enum PixelFormat origin)
1415 {
1416     int i;
1417     assert(src1==src2);
1418     for (i = 0; i < width; i++) {
1419         int r_b = input_pixel(&src1[i*3+0]);
1420         int   g = input_pixel(&src1[i*3+1]);
1421         int b_r = input_pixel(&src1[i*3+2]);
1422
1423         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1424         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1425     }
1426 }
1427
1428 static av_always_inline void
1429 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1430                           const uint16_t *src1, const uint16_t *src2,
1431                           int width, enum PixelFormat origin)
1432 {
1433     int i;
1434     assert(src1==src2);
1435     for (i = 0; i < width; i++) {
1436         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1437         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1438         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1439
1440         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1441         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1442     }
1443 }
1444
/* The r / b / input_pixel helpers of the 48-bit templates end here. */
1445 #undef r
1446 #undef b
1447 #undef input_pixel
1448
/*
 * rgb48funcs(pattern, BE_LE, origin) defines three functions,
 * <pattern>48<BE_LE>ToY_c, ...ToUV_c and ...ToUV_half_c. Each takes byte
 * pointers, casts them to uint16_t pointers and calls the matching
 * rgb48To*_c_template with "origin" as the pixel format. The trailing
 * uint32_t pointer argument is not used.
 */
1449 #define rgb48funcs(pattern, BE_LE, origin) \
1450 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
1451                                     int width, uint32_t *unused) \
1452 { \
1453     const uint16_t *src = (const uint16_t *) _src; \
1454     uint16_t *dst = (uint16_t *) _dst; \
1455     rgb48ToY_c_template(dst, src, width, origin); \
1456 } \
1457  \
1458 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1459                                     const uint8_t *_src1, const uint8_t *_src2, \
1460                                     int width, uint32_t *unused) \
1461 { \
1462     const uint16_t *src1 = (const uint16_t *) _src1, \
1463                    *src2 = (const uint16_t *) _src2; \
1464     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1465     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1466 } \
1467  \
1468 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1469                                     const uint8_t *_src1, const uint8_t *_src2, \
1470                                     int width, uint32_t *unused) \
1471 { \
1472     const uint16_t *src1 = (const uint16_t *) _src1, \
1473                    *src2 = (const uint16_t *) _src2; \
1474     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1475     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1476 }
1477
/* One set of readers per component order and byte order. */
1478 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1479 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1480 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1481 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1482
/*
 * Pixel fetch for the 16/32-bit RGB input templates below: pixel i is
 * read from the byte pointer "src" as an aligned native-endian 32-bit
 * word for the four 32-bit formats, otherwise as a 16-bit value in the
 * byte order of "origin".
 */
1483 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1484                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1485                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1486
1487 static av_always_inline void
1488 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1489                        int width, enum PixelFormat origin,
1490                        int shr,   int shg,   int shb, int shp,
1491                        int maskr, int maskg, int maskb,
1492                        int rsh,   int gsh,   int bsh, int S)
1493 {
1494     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1495               rnd = 33 << (S - 1);
1496     int i;
1497
1498     for (i = 0; i < width; i++) {
1499         int px = input_pixel(i) >> shp;
1500         int b = (px & maskb) >> shb;
1501         int g = (px & maskg) >> shg;
1502         int r = (px & maskr) >> shr;
1503
1504         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1505     }
1506 }
1507
1508 static av_always_inline void
1509 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1510                         const uint8_t *src, int width,
1511                         enum PixelFormat origin,
1512                         int shr,   int shg,   int shb, int shp,
1513                         int maskr, int maskg, int maskb,
1514                         int rsh,   int gsh,   int bsh, int S)
1515 {
1516     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1517               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1518               rnd = 257 << (S - 1);
1519     int i;
1520
1521     for (i = 0; i < width; i++) {
1522         int px = input_pixel(i) >> shp;
1523         int b = (px & maskb) >> shb;
1524         int g = (px & maskg) >> shg;
1525         int r = (px & maskr) >> shr;
1526
1527         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1528         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1529     }
1530 }
1531
1532 static av_always_inline void
1533 rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1534                              const uint8_t *src, int width,
1535                              enum PixelFormat origin,
1536                              int shr,   int shg,   int shb, int shp,
1537                              int maskr, int maskg, int maskb,
1538                              int rsh,   int gsh,   int bsh, int S)
1539 {
1540     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1541               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1542               rnd = 257 << S, maskgx = ~(maskr | maskb);
1543     int i;
1544
1545     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1546     for (i = 0; i < width; i++) {
1547         int px0 = input_pixel(2 * i + 0) >> shp;
1548         int px1 = input_pixel(2 * i + 1) >> shp;
1549         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1550         int rb = px0 + px1 - g;
1551
1552         b = (rb & maskb) >> shb;
1553         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1554             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1555             g >>= shg;
1556         } else {
1557             g = (g  & maskg) >> shg;
1558         }
1559         r = (rb & maskr) >> shr;
1560
1561         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
1562         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
1563     }
1564 }
1565
1566 #undef input_pixel
1567
1568 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1569                          maskg, maskb, rsh, gsh, bsh, S) \
1570 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1571                           int width, uint32_t *unused) \
1572 { \
1573     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1574                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1575 } \
1576  \
1577 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1578                            const uint8_t *src, const uint8_t *dummy, \
1579                            int width, uint32_t *unused) \
1580 { \
1581     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1582                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1583 } \
1584  \
1585 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1586                                 const uint8_t *src, const uint8_t *dummy, \
1587                                 int width, uint32_t *unused) \
1588 { \
1589     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1590                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1591 }
1592
1593 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1594 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1595 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1596 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1597 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1598 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1599 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1600 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1601 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1602 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1603 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1604 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1605
/**
 * Extract the alpha plane from 4-byte pixels whose alpha is the first byte.
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *alpha = src;
    int x;

    for (x = 0; x < width; x++, alpha += 4)
        dst[x] = alpha[0];
}
1613
/**
 * Extract the alpha plane from 4-byte pixels whose alpha is the last byte.
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *alpha = src + 3;
    int x;

    for (x = 0; x < width; x++, alpha += 4)
        dst[x] = alpha[0];
}
1621
/**
 * Map palette indices to luma: the output is the low byte of the palette
 * entry selected by each source byte.
 *
 * @param dst   output line, one byte per pixel
 * @param src   palette indices, one byte per pixel
 * @param width number of pixels
 * @param pal   palette table indexed by the source byte
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1631
/**
 * Map palette indices to chroma: U is bits 8..15 and V is bits 16..23 of the
 * palette entry selected by each source byte.
 *
 * @param dstU  output U line
 * @param dstV  output V line
 * @param src1  palette indices, one byte per pixel
 * @param src2  must be the same pointer as src1 (asserted)
 * @param width number of pixels
 * @param pal   palette table indexed by the source byte
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        const int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
        x++;
    }
}
1645
/**
 * Expand a 1 bit-per-pixel line in which a 0 bit means white into 8-bit luma.
 *
 * Bits are consumed most-significant first; each source byte is inverted so
 * that a 0 bit yields 255 and a 1 bit yields 0.
 *
 * Exactly width output bytes are written. When width is not a multiple of 8
 * the remaining pixels are taken from the leading bits of the next source
 * byte (previously those trailing pixels were left unconverted).
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 8 pixels per byte
 * @param width  number of pixels
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width % 8;
    int i, j;

    for (i = 0; i < full_bytes; i++) {
        int d = ~src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }
    /* partial last byte: only the first tail_bits pixels belong to the line */
    if (tail_bits > 0) {
        int d = ~src[full_bytes];
        for (j = 0; j < tail_bits; j++)
            dst[8*full_bytes+j] = ((d>>(7-j))&1)*255;
    }
}
1656
/**
 * Expand a 1 bit-per-pixel line in which a 0 bit means black into 8-bit luma.
 *
 * Bits are consumed most-significant first; a 1 bit yields 255 and a 0 bit
 * yields 0.
 *
 * Exactly width output bytes are written. When width is not a multiple of 8
 * the remaining pixels are taken from the leading bits of the next source
 * byte (previously those trailing pixels were left unconverted).
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 8 pixels per byte
 * @param width  number of pixels
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width % 8;
    int i, j;

    for (i = 0; i < full_bytes; i++) {
        int d = src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }
    /* partial last byte: only the first tail_bits pixels belong to the line */
    if (tail_bits > 0) {
        int d = src[full_bytes];
        for (j = 0; j < tail_bits; j++)
            dst[8*full_bytes+j] = ((d>>(7-j))&1)*255;
    }
}
1667
1668 //FIXME yuy2* can read up to 7 samples too much
1669
/**
 * Extract luma from a packed line whose luma samples sit at even byte
 * offsets (every second byte, starting with byte 0).
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 2 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    const uint8_t *luma = src;
    int x;

    for (x = 0; x < width; x++, luma += 2)
        dst[x] = luma[0];
}
1677
/**
 * Extract chroma from a packed line made of 4-byte groups: U is byte 1 and
 * V is byte 3 of each group.
 *
 * @param dstU   output U line
 * @param dstV   output V line
 * @param src1   packed input line
 * @param src2   must be the same pointer as src1 (asserted after the loop)
 * @param width  number of chroma samples to produce
 * @param unused ignored
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int x;

    for (x = 0; x < width; x++, group += 4) {
        dstU[x] = group[1];
        dstV[x] = group[3];
    }
    assert(src1 == src2);
}
1688
/**
 * Copy a line of 16-bit samples, passing every sample through av_bswap16().
 *
 * @param _dst   output line, accessed as uint16_t
 * @param _src   input line, accessed as uint16_t
 * @param width  number of 16-bit samples
 * @param unused ignored
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
{
    const uint16_t *in  = (const uint16_t *) _src;
    uint16_t       *out = (uint16_t *) _dst;
    int x = 0;

    while (x < width) {
        out[x] = av_bswap16(in[x]);
        x++;
    }
}
1698
/**
 * Copy two lines of 16-bit chroma samples (U from _src1, V from _src2),
 * passing every sample through av_bswap16().
 *
 * @param _dstU  output U line, accessed as uint16_t
 * @param _dstV  output V line, accessed as uint16_t
 * @param _src1  input U line, accessed as uint16_t
 * @param _src2  input V line, accessed as uint16_t
 * @param width  number of 16-bit samples per line
 * @param unused ignored
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *in_u  = (const uint16_t *) _src1;
    const uint16_t *in_v  = (const uint16_t *) _src2;
    uint16_t       *out_u = (uint16_t *) _dstU;
    uint16_t       *out_v = (uint16_t *) _dstV;
    int x = 0;

    while (x < width) {
        out_u[x] = av_bswap16(in_u[x]);
        out_v[x] = av_bswap16(in_v[x]);
        x++;
    }
}
1711
/* This is almost identical to the yuy2 readers above, and exists only because
 * yuy2To{Y,UV}(dst, src + 1, ...) would have 100% unaligned accesses. */
/**
 * Extract luma from a packed line whose luma samples sit at odd byte
 * offsets (every second byte, starting with byte 1).
 *
 * @param dst    output line, one byte per pixel
 * @param src    packed input line, 2 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    const uint8_t *luma = src + 1;
    int x;

    for (x = 0; x < width; x++, luma += 2)
        dst[x] = luma[0];
}
1721
/**
 * Extract chroma from a packed line made of 4-byte groups: U is byte 0 and
 * V is byte 2 of each group.
 *
 * @param dstU   output U line
 * @param dstV   output V line
 * @param src1   packed input line
 * @param src2   must be the same pointer as src1 (asserted after the loop)
 * @param width  number of chroma samples to produce
 * @param unused ignored
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int x;

    for (x = 0; x < width; x++, group += 4) {
        dstU[x] = group[0];
        dstV[x] = group[2];
    }
    assert(src1 == src2);
}
1732
1733 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1734                                         const uint8_t *src, int width)
1735 {
1736     int i;
1737     for (i = 0; i < width; i++) {
1738         dst1[i] = src[2*i+0];
1739         dst2[i] = src[2*i+1];
1740     }
1741 }
1742
/**
 * Split an interleaved chroma line in which the first byte of each pair is
 * U and the second is V.
 *
 * @param dstU   output U line
 * @param dstV   output V line
 * @param src1   interleaved chroma line
 * @param src2   ignored
 * @param width  number of chroma pairs
 * @param unused ignored
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    /* first byte of each pair -> U, second -> V */
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1749
/**
 * Split an interleaved chroma line in which the first byte of each pair is
 * V and the second is U.
 *
 * @param dstU   output U line
 * @param dstV   output V line
 * @param src1   interleaved chroma line
 * @param src2   ignored
 * @param width  number of chroma pairs
 * @param unused ignored
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    /* destinations swapped: first byte of each pair -> V, second -> U */
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1756
/* Read a 16-bit sample at pos: AV_RB16 when isBE(origin) is true, AV_RL16
 * otherwise. `origin` must be in scope wherever the macro is expanded.
 * NOTE(review): none of the functions that follow in this part of the file
 * use this macro -- presumably it serves code elsewhere; confirm before
 * removing it. */
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1758
1759 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1760                        int width, uint32_t *unused)
1761 {
1762     int i;
1763     for (i=0; i<width; i++) {
1764         int b= src[i*3+0];
1765         int g= src[i*3+1];
1766         int r= src[i*3+2];
1767
1768         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1769     }
1770 }
1771
1772 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1773                         const uint8_t *src2, int width, uint32_t *unused)
1774 {
1775     int i;
1776     for (i=0; i<width; i++) {
1777         int b= src1[3*i + 0];
1778         int g= src1[3*i + 1];
1779         int r= src1[3*i + 2];
1780
1781         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1782         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1783     }
1784     assert(src1 == src2);
1785 }
1786
1787 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1788                              const uint8_t *src2, int width, uint32_t *unused)
1789 {
1790     int i;
1791     for (i=0; i<width; i++) {
1792         int b= src1[6*i + 0] + src1[6*i + 3];
1793         int g= src1[6*i + 1] + src1[6*i + 4];
1794         int r= src1[6*i + 2] + src1[6*i + 5];
1795
1796         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1797         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1798     }
1799     assert(src1 == src2);
1800 }
1801
1802 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1803                        uint32_t *unused)
1804 {
1805     int i;
1806     for (i=0; i<width; i++) {
1807         int r= src[i*3+0];
1808         int g= src[i*3+1];
1809         int b= src[i*3+2];
1810
1811         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1812     }
1813 }
1814
1815 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1816                         const uint8_t *src2, int width, uint32_t *unused)
1817 {
1818     int i;
1819     assert(src1==src2);
1820     for (i=0; i<width; i++) {
1821         int r= src1[3*i + 0];
1822         int g= src1[3*i + 1];
1823         int b= src1[3*i + 2];
1824
1825         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1826         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1827     }
1828 }
1829
1830 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1831                              const uint8_t *src2, int width, uint32_t *unused)
1832 {
1833     int i;
1834     assert(src1==src2);
1835     for (i=0; i<width; i++) {
1836         int r= src1[6*i + 0] + src1[6*i + 3];
1837         int g= src1[6*i + 1] + src1[6*i + 4];
1838         int b= src1[6*i + 2] + src1[6*i + 5];
1839
1840         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1841         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1842     }
1843 }
1844
1845 static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
1846                        const int16_t *filter,
1847                        const int16_t *filterPos, int filterSize)
1848 {
1849     int i;
1850     int32_t *dst = (int32_t *) _dst;
1851     const uint16_t *src = (const uint16_t *) _src;
1852     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1853     int sh = (bits <= 7) ? 11 : (bits - 4);
1854
1855     for (i = 0; i < dstW; i++) {
1856         int j;
1857         int srcPos = filterPos[i];
1858         int val = 0;
1859
1860         for (j = 0; j < filterSize; j++) {
1861             val += src[srcPos + j] * filter[filterSize * i + j];
1862         }
1863         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1864         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1865     }
1866 }
1867
1868 // bilinear / bicubic scaling
1869 static void hScale_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
1870                      const int16_t *filter, const int16_t *filterPos,
1871                      int filterSize)
1872 {
1873     int i;
1874     for (i=0; i<dstW; i++) {
1875         int j;
1876         int srcPos= filterPos[i];
1877         int val=0;
1878         for (j=0; j<filterSize; j++) {
1879             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1880         }
1881         //filter += hFilterSize;
1882         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
1883         //dst[i] = val>>7;
1884     }
1885 }
1886
//FIXME all pal and rgb srcFormats could do this conversion as well
1888 //FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place range conversion of chroma lines: each sample is capped at 30775
 * and then mapped to (x * 4663 - 9289992) >> 12.
 *
 * @param dstU  U line, converted in place
 * @param dstV  V line, converted in place
 * @param width number of samples per line
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        dstU[x] = (FFMIN(dstU[x], 30775) * 4663 - 9289992) >> 12; //-264
        dstV[x] = (FFMIN(dstV[x], 30775) * 4663 - 9289992) >> 12; //-264
        x++;
    }
}
/**
 * In-place range conversion of chroma lines: each sample is mapped to
 * (x * 1799 + 4081085) >> 11.
 *
 * @param dstU  U line, converted in place
 * @param dstV  V line, converted in place
 * @param width number of samples per line
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    enum { GAIN = 1799, OFFSET = 4081085, SHIFT = 11 };
    int x = 0;

    while (x < width) {
        dstU[x] = (dstU[x] * GAIN + OFFSET) >> SHIFT; //1469
        dstV[x] = (dstV[x] * GAIN + OFFSET) >> SHIFT; //1469
        x++;
    }
}
/**
 * In-place range conversion of a luma line: each sample is capped at 30189
 * and then mapped to (x * 19077 - 39057361) >> 14.
 *
 * @param dst   luma line, converted in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        dst[x] = (FFMIN(dst[x], 30189) * 19077 - 39057361) >> 14;
        x++;
    }
}
/**
 * In-place range conversion of a luma line: each sample is mapped to
 * (x * 14071 + 33561947) >> 14.
 *
 * @param dst   luma line, converted in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    enum { GAIN = 14071, OFFSET = 33561947, SHIFT = 14 };
    int x = 0;

    while (x < width) {
        dst[x] = (dst[x] * GAIN + OFFSET) >> SHIFT;
        x++;
    }
}
1917
/**
 * In-place range conversion of chroma lines held as 32-bit samples: each
 * sample is capped at 30775 << 4 and then mapped to
 * (x * 4663 - (9289992 << 4)) >> 12.
 *
 * The product is formed in 64 bits: at the cap, (30775 << 4) * 4663 is
 * larger than INT_MAX, so a plain int multiplication overflows even though
 * the final result fits in 32 bits.
 *
 * @param _dstU U line, accessed as int32_t and converted in place
 * @param _dstV V line, accessed as int32_t and converted in place
 * @param width number of samples per line
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        dstU[i] = (int32_t)(((int64_t)FFMIN(dstU[i],30775<<4)*4663 - (9289992<<4))>>12); //-264
        dstV[i] = (int32_t)(((int64_t)FFMIN(dstV[i],30775<<4)*4663 - (9289992<<4))>>12); //-264
    }
}
/**
 * In-place range conversion of chroma lines held as 32-bit samples: each
 * sample is mapped to (x * 1799 + (4081085 << 4)) >> 11.
 *
 * @param _dstU U line, accessed as int32_t and converted in place
 * @param _dstV V line, accessed as int32_t and converted in place
 * @param width number of samples per line
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u = (int32_t *) _dstU;
    int32_t *v = (int32_t *) _dstV;
    int x = 0;

    while (x < width) {
        u[x] = (u[x] * 1799 + (4081085 << 4)) >> 11; //1469
        v[x] = (v[x] * 1799 + (4081085 << 4)) >> 11; //1469
        x++;
    }
}
/**
 * In-place range conversion of a luma line held as 32-bit samples: each
 * sample is capped at 30189 << 4 and then mapped to
 * (x * 4769 - (39057361 << 2)) >> 12.
 *
 * The product is formed in 64 bits: at the cap, (30189 << 4) * 4769 is
 * larger than INT_MAX, so a plain int multiplication overflows even though
 * the final result fits in 32 bits.
 *
 * @param _dst  luma line, accessed as int32_t and converted in place
 * @param width number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++)
        dst[i] = (int32_t)(((int64_t)FFMIN(dst[i],30189<<4)*4769 - (39057361<<2))>>12);
}
/**
 * In-place range conversion of a luma line held as 32-bit samples: each
 * sample is mapped to (x * 14071 + (33561947 << 4)) >> 14.
 *
 * The arithmetic is done in 64 bits. With a plain int, x * 14071 plus the
 * offset exceeds INT_MAX once x is above roughly 114000, which corrupted
 * the output for larger sample values; the shifted result itself fits in
 * 32 bits.
 *
 * @param _dst  luma line, accessed as int32_t and converted in place
 * @param width number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++)
        dst[i] = (int32_t)(((int64_t)dst[i]*14071 + (33561947<<4))>>14);
}
1952
1953 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1954                            const uint8_t *src, int srcW, int xInc)
1955 {
1956     int i;
1957     unsigned int xpos=0;
1958     for (i=0;i<dstWidth;i++) {
1959         register unsigned int xx=xpos>>16;
1960         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1961         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1962         xpos+=xInc;
1963     }
1964 }
1965
/**
 * Widen 8-bit samples to 16 bits by writing each source byte into both
 * bytes of the corresponding 16-bit slot.
 *
 * The line is processed from the last sample to the first, so the expansion
 * also works when src and _dst start at the same address.
 *
 * @param _dst output, len 16-bit slots (written bytewise)
 * @param src  input, len bytes
 * @param len  number of samples
 */
static void scale8To16Rv_c(uint16_t *_dst, const uint8_t *src, int len)
{
    uint8_t *bytes = (uint8_t *) _dst;
    int x = len;

    while (--x >= 0) {
        const uint8_t sample = src[x];

        bytes[2 * x + 1] = sample;
        bytes[2 * x]     = sample;
    }
}
1974
/**
 * Narrow 32-bit samples to 16 bits by dropping the four low bits.
 *
 * The line is processed from the first sample to the last.
 *
 * @param dst output, len 16-bit samples
 * @param src input, len 32-bit samples
 * @param len number of samples
 */
static void scale19To15Fw_c(int16_t *dst, const int32_t *src, int len)
{
    int x = 0;

    while (x < len) {
        dst[x] = src[x] >> 4;
        x++;
    }
}
1982
1983 // *** horizontal scale Y line to temp buffer
1984 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
1985                                      const uint8_t *src, int srcW, int xInc,
1986                                      const int16_t *hLumFilter,
1987                                      const int16_t *hLumFilterPos, int hLumFilterSize,
1988                                      uint8_t *formatConvBuffer,
1989                                      uint32_t *pal, int isAlpha)
1990 {
1991     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1992     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1993
1994     if (toYV12) {
1995         toYV12(formatConvBuffer, src, srcW, pal);
1996         src= formatConvBuffer;
1997     }
1998
1999     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2000         c->scale8To16Rv((uint16_t *) formatConvBuffer, src, srcW);
2001         src = formatConvBuffer;
2002     }
2003
2004     if (!c->hyscale_fast) {
2005         c->hScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2006     } else { // fast bilinear upscale / crap downscale
2007         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2008     }
2009
2010     if (convertRange)
2011         convertRange(dst, dstWidth);
2012
2013     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2014         c->scale19To15Fw(dst, (int32_t *) dst, dstWidth);
2015     }
2016 }
2017
2018 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2019                            int dstWidth, const uint8_t *src1,
2020                            const uint8_t *src2, int srcW, int xInc)
2021 {
2022     int i;
2023     unsigned int xpos=0;
2024     for (i=0;i<dstWidth;i++) {
2025         register unsigned int xx=xpos>>16;
2026         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2027         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2028         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2029         xpos+=xInc;
2030     }
2031 }
2032
2033 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2034                                      const uint8_t *src1, const uint8_t *src2,
2035                                      int srcW, int xInc, const int16_t *hChrFilter,
2036                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2037                                      uint8_t *formatConvBuffer, uint32_t *pal)
2038 {
2039     if (c->chrToYV12) {
2040         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * c->scalingBpp >> 3, 16);
2041         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2042         src1= formatConvBuffer;
2043         src2= buf2;
2044     }
2045
2046     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2047         uint8_t *buf2 = (formatConvBuffer + FFALIGN(srcW * 2, 16));
2048         c->scale8To16Rv((uint16_t *) formatConvBuffer, src1, srcW);
2049         c->scale8To16Rv((uint16_t *) buf2,             src2, srcW);
2050         src1 = formatConvBuffer;
2051         src2 = buf2;
2052     }
2053
2054     if (!c->hcscale_fast) {
2055         c->hScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2056         c->hScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2057     } else { // fast bilinear upscale / crap downscale
2058         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2059     }
2060
2061     if (c->chrConvertRange)
2062         c->chrConvertRange(dst1, dst2, dstWidth);
2063
2064     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2065         c->scale19To15Fw(dst1, (int32_t *) dst1, dstWidth);
2066         c->scale19To15Fw(dst2, (int32_t *) dst2, dstWidth);
2067     }
2068 }
2069
2070 static av_always_inline void
2071 find_c_packed_planar_out_funcs(SwsContext *c,
2072                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
2073                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2074                                yuv2packedX_fn *yuv2packedX)
2075 {
2076     enum PixelFormat dstFormat = c->dstFormat;
2077
2078     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2079         *yuv2yuvX     = yuv2nv12X_c;
2080     } else if (is16BPS(dstFormat)) {
2081         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
2082     } else if (is9_OR_10BPS(dstFormat)) {
2083         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2084             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
2085         } else {
2086             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
2087         }
2088     } else {
2089         *yuv2yuv1     = yuv2yuv1_c;
2090         *yuv2yuvX     = yuv2yuvX_c;
2091     }
2092     if(c->flags & SWS_FULL_CHR_H_INT) {
2093         switch (dstFormat) {
2094             case PIX_FMT_RGBA:
2095 #if CONFIG_SMALL
2096                 *yuv2packedX = yuv2rgba32_full_X_c;
2097 #else
2098 #if CONFIG_SWSCALE_ALPHA
2099                 if (c->alpPixBuf) {
2100                     *yuv2packedX = yuv2rgba32_full_X_c;
2101                 } else
2102 #endif /* CONFIG_SWSCALE_ALPHA */
2103                 {
2104                     *yuv2packedX = yuv2rgbx32_full_X_c;
2105                 }
2106 #endif /* !CONFIG_SMALL */
2107                 break;
2108             case PIX_FMT_ARGB:
2109 #if CONFIG_SMALL
2110                 *yuv2packedX = yuv2argb32_full_X_c;
2111 #else
2112 #if CONFIG_SWSCALE_ALPHA
2113                 if (c->alpPixBuf) {
2114                     *yuv2packedX = yuv2argb32_full_X_c;
2115                 } else
2116 #endif /* CONFIG_SWSCALE_ALPHA */
2117                 {
2118                     *yuv2packedX = yuv2xrgb32_full_X_c;
2119                 }
2120 #endif /* !CONFIG_SMALL */
2121                 break;
2122             case PIX_FMT_BGRA:
2123 #if CONFIG_SMALL
2124                 *yuv2packedX = yuv2bgra32_full_X_c;
2125 #else
2126 #if CONFIG_SWSCALE_ALPHA
2127                 if (c->alpPixBuf) {
2128                     *yuv2packedX = yuv2bgra32_full_X_c;
2129                 } else
2130 #endif /* CONFIG_SWSCALE_ALPHA */
2131                 {
2132                     *yuv2packedX = yuv2bgrx32_full_X_c;
2133                 }
2134 #endif /* !CONFIG_SMALL */
2135                 break;
2136             case PIX_FMT_ABGR:
2137 #if CONFIG_SMALL
2138                 *yuv2packedX = yuv2abgr32_full_X_c;
2139 #else
2140 #if CONFIG_SWSCALE_ALPHA
2141                 if (c->alpPixBuf) {
2142                     *yuv2packedX = yuv2abgr32_full_X_c;
2143                 } else
2144 #endif /* CONFIG_SWSCALE_ALPHA */
2145                 {
2146                     *yuv2packedX = yuv2xbgr32_full_X_c;
2147                 }
2148 #endif /* !CONFIG_SMALL */
2149                 break;
2150             case PIX_FMT_RGB24:
2151             *yuv2packedX = yuv2rgb24_full_X_c;
2152             break;
2153         case PIX_FMT_BGR24:
2154             *yuv2packedX = yuv2bgr24_full_X_c;
2155             break;
2156         }
2157     } else {
2158         switch (dstFormat) {
2159         case PIX_FMT_GRAY16BE:
2160             *yuv2packed1 = yuv2gray16BE_1_c;
2161             *yuv2packed2 = yuv2gray16BE_2_c;
2162             *yuv2packedX = yuv2gray16BE_X_c;
2163             break;
2164         case PIX_FMT_GRAY16LE:
2165             *yuv2packed1 = yuv2gray16LE_1_c;
2166             *yuv2packed2 = yuv2gray16LE_2_c;
2167             *yuv2packedX = yuv2gray16LE_X_c;
2168             break;
2169         case PIX_FMT_MONOWHITE:
2170             *yuv2packed1 = yuv2monowhite_1_c;
2171             *yuv2packed2 = yuv2monowhite_2_c;
2172             *yuv2packedX = yuv2monowhite_X_c;
2173             break;
2174         case PIX_FMT_MONOBLACK:
2175             *yuv2packed1 = yuv2monoblack_1_c;
2176             *yuv2packed2 = yuv2monoblack_2_c;
2177             *yuv2packedX = yuv2monoblack_X_c;
2178             break;
2179         case PIX_FMT_YUYV422:
2180             *yuv2packed1 = yuv2yuyv422_1_c;
2181             *yuv2packed2 = yuv2yuyv422_2_c;
2182             *yuv2packedX = yuv2yuyv422_X_c;
2183             break;
2184         case PIX_FMT_UYVY422:
2185             *yuv2packed1 = yuv2uyvy422_1_c;
2186             *yuv2packed2 = yuv2uyvy422_2_c;
2187             *yuv2packedX = yuv2uyvy422_X_c;
2188             break;
2189         case PIX_FMT_RGB48LE:
2190             *yuv2packed1 = yuv2rgb48le_1_c;
2191             *yuv2packed2 = yuv2rgb48le_2_c;
2192             *yuv2packedX = yuv2rgb48le_X_c;
2193             break;
2194         case PIX_FMT_RGB48BE:
2195             *yuv2packed1 = yuv2rgb48be_1_c;
2196             *yuv2packed2 = yuv2rgb48be_2_c;
2197             *yuv2packedX = yuv2rgb48be_X_c;
2198             break;
2199         case PIX_FMT_BGR48LE:
2200             *yuv2packed1 = yuv2bgr48le_1_c;
2201             *yuv2packed2 = yuv2bgr48le_2_c;
2202             *yuv2packedX = yuv2bgr48le_X_c;
2203             break;
2204         case PIX_FMT_BGR48BE:
2205             *yuv2packed1 = yuv2bgr48be_1_c;
2206             *yuv2packed2 = yuv2bgr48be_2_c;
2207             *yuv2packedX = yuv2bgr48be_X_c;
2208             break;
2209         case PIX_FMT_RGB32:
2210         case PIX_FMT_BGR32:
2211 #if CONFIG_SMALL
2212             *yuv2packed1 = yuv2rgb32_1_c;
2213             *yuv2packed2 = yuv2rgb32_2_c;
2214             *yuv2packedX = yuv2rgb32_X_c;
2215 #else
2216 #if CONFIG_SWSCALE_ALPHA
2217                 if (c->alpPixBuf) {
2218                     *yuv2packed1 = yuv2rgba32_1_c;
2219                     *yuv2packed2 = yuv2rgba32_2_c;
2220                     *yuv2packedX = yuv2rgba32_X_c;
2221                 } else
2222 #endif /* CONFIG_SWSCALE_ALPHA */
2223                 {
2224                     *yuv2packed1 = yuv2rgbx32_1_c;
2225                     *yuv2packed2 = yuv2rgbx32_2_c;
2226                     *yuv2packedX = yuv2rgbx32_X_c;
2227                 }
2228 #endif /* !CONFIG_SMALL */
2229             break;
2230         case PIX_FMT_RGB32_1:
2231         case PIX_FMT_BGR32_1:
2232 #if CONFIG_SMALL
2233                 *yuv2packed1 = yuv2rgb32_1_1_c;
2234                 *yuv2packed2 = yuv2rgb32_1_2_c;
2235                 *yuv2packedX = yuv2rgb32_1_X_c;
2236 #else
2237 #if CONFIG_SWSCALE_ALPHA
2238                 if (c->alpPixBuf) {
2239                     *yuv2packed1 = yuv2rgba32_1_1_c;
2240                     *yuv2packed2 = yuv2rgba32_1_2_c;
2241                     *yuv2packedX = yuv2rgba32_1_X_c;
2242                 } else
2243 #endif /* CONFIG_SWSCALE_ALPHA */
2244                 {
2245                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2246                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2247                     *yuv2packedX = yuv2rgbx32_1_X_c;
2248                 }
2249 #endif /* !CONFIG_SMALL */
2250                 break;
2251         case PIX_FMT_RGB24:
2252             *yuv2packed1 = yuv2rgb24_1_c;
2253             *yuv2packed2 = yuv2rgb24_2_c;
2254             *yuv2packedX = yuv2rgb24_X_c;
2255             break;
2256         case PIX_FMT_BGR24:
2257             *yuv2packed1 = yuv2bgr24_1_c;
2258             *yuv2packed2 = yuv2bgr24_2_c;
2259             *yuv2packedX = yuv2bgr24_X_c;
2260             break;
2261         case PIX_FMT_RGB565LE:
2262         case PIX_FMT_RGB565BE:
2263         case PIX_FMT_BGR565LE:
2264         case PIX_FMT_BGR565BE:
2265             *yuv2packed1 = yuv2rgb16_1_c;
2266             *yuv2packed2 = yuv2rgb16_2_c;
2267             *yuv2packedX = yuv2rgb16_X_c;
2268             break;
2269         case PIX_FMT_RGB555LE:
2270         case PIX_FMT_RGB555BE:
2271         case PIX_FMT_BGR555LE:
2272         case PIX_FMT_BGR555BE:
2273             *yuv2packed1 = yuv2rgb15_1_c;
2274             *yuv2packed2 = yuv2rgb15_2_c;
2275             *yuv2packedX = yuv2rgb15_X_c;
2276             break;
2277         case PIX_FMT_RGB444LE:
2278         case PIX_FMT_RGB444BE:
2279         case PIX_FMT_BGR444LE:
2280         case PIX_FMT_BGR444BE:
2281             *yuv2packed1 = yuv2rgb12_1_c;
2282             *yuv2packed2 = yuv2rgb12_2_c;
2283             *yuv2packedX = yuv2rgb12_X_c;
2284             break;
2285         case PIX_FMT_RGB8:
2286         case PIX_FMT_BGR8:
2287             *yuv2packed1 = yuv2rgb8_1_c;
2288             *yuv2packed2 = yuv2rgb8_2_c;
2289             *yuv2packedX = yuv2rgb8_X_c;
2290             break;
2291         case PIX_FMT_RGB4:
2292         case PIX_FMT_BGR4:
2293             *yuv2packed1 = yuv2rgb4_1_c;
2294             *yuv2packed2 = yuv2rgb4_2_c;
2295             *yuv2packedX = yuv2rgb4_X_c;
2296             break;
2297         case PIX_FMT_RGB4_BYTE:
2298         case PIX_FMT_BGR4_BYTE:
2299             *yuv2packed1 = yuv2rgb4b_1_c;
2300             *yuv2packed2 = yuv2rgb4b_2_c;
2301             *yuv2packedX = yuv2rgb4b_X_c;
2302             break;
2303         }
2304     }
2305 }
2306
/* Set DEBUG_SWSCALE_BUFFERS to a non-zero value to have DEBUG_BUFFERS() emit
 * its message through av_log() at AV_LOG_DEBUG level.
 * DEBUG_BUFFERS() uses a variable named `c` from the expanding scope as the
 * av_log() context. It expands to a bare `if` statement, so do not place it
 * directly in front of an `else`. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2309
2310 static int swScale(SwsContext *c, const uint8_t* src[],
2311                    int srcStride[], int srcSliceY,
2312                    int srcSliceH, uint8_t* dst[], int dstStride[])
2313 {
2314     /* load a few things into local vars to make the code more readable? and faster */
2315     const int srcW= c->srcW;
2316     const int dstW= c->dstW;
2317     const int dstH= c->dstH;
2318     const int chrDstW= c->chrDstW;
2319     const int chrSrcW= c->chrSrcW;
2320     const int lumXInc= c->lumXInc;
2321     const int chrXInc= c->chrXInc;
2322     const enum PixelFormat dstFormat= c->dstFormat;
2323     const int flags= c->flags;
2324     int16_t *vLumFilterPos= c->vLumFilterPos;
2325     int16_t *vChrFilterPos= c->vChrFilterPos;
2326     int16_t *hLumFilterPos= c->hLumFilterPos;
2327     int16_t *hChrFilterPos= c->hChrFilterPos;
2328     int16_t *vLumFilter= c->vLumFilter;
2329     int16_t *vChrFilter= c->vChrFilter;
2330     int16_t *hLumFilter= c->hLumFilter;
2331     int16_t *hChrFilter= c->hChrFilter;
2332     int32_t *lumMmxFilter= c->lumMmxFilter;
2333     int32_t *chrMmxFilter= c->chrMmxFilter;
2334     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2335     const int vLumFilterSize= c->vLumFilterSize;
2336     const int vChrFilterSize= c->vChrFilterSize;
2337     const int hLumFilterSize= c->hLumFilterSize;
2338     const int hChrFilterSize= c->hChrFilterSize;
2339     int16_t **lumPixBuf= c->lumPixBuf;
2340     int16_t **chrUPixBuf= c->chrUPixBuf;
2341     int16_t **chrVPixBuf= c->chrVPixBuf;
2342     int16_t **alpPixBuf= c->alpPixBuf;
2343     const int vLumBufSize= c->vLumBufSize;
2344     const int vChrBufSize= c->vChrBufSize;
2345     uint8_t *formatConvBuffer= c->formatConvBuffer;
2346     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2347     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2348     int lastDstY;
2349     uint32_t *pal=c->pal_yuv;
2350     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2351     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2352     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2353     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2354     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2355
2356     /* vars which will change and which we need to store back in the context */
2357     int dstY= c->dstY;
2358     int lumBufIndex= c->lumBufIndex;
2359     int chrBufIndex= c->chrBufIndex;
2360     int lastInLumBuf= c->lastInLumBuf;
2361     int lastInChrBuf= c->lastInChrBuf;
2362
2363     if (isPacked(c->srcFormat)) {
2364         src[0]=
2365         src[1]=
2366         src[2]=
2367         src[3]= src[0];
2368         srcStride[0]=
2369         srcStride[1]=
2370         srcStride[2]=
2371         srcStride[3]= srcStride[0];
2372     }
2373     srcStride[1]<<= c->vChrDrop;
2374     srcStride[2]<<= c->vChrDrop;
2375
2376     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2377                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2378                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2379     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2380                    srcSliceY,    srcSliceH,    dstY,    dstH);
2381     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2382                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2383
2384     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2385         static int warnedAlready=0; //FIXME move this into the context perhaps
2386         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2387             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2388                    "         ->cannot do aligned memory accesses anymore\n");
2389             warnedAlready=1;
2390         }
2391     }
2392
2393     /* Note the user might start scaling the picture in the middle so this
2394        will not get executed. This is not really intended but works
2395        currently, so people might do it. */
2396     if (srcSliceY ==0) {
2397         lumBufIndex=-1;
2398         chrBufIndex=-1;
2399         dstY=0;
2400         lastInLumBuf= -1;
2401         lastInChrBuf= -1;
2402     }
2403
2404     lastDstY= dstY;
2405
2406     for (;dstY < dstH; dstY++) {
2407         const int chrDstY= dstY>>c->chrDstVSubSample;
2408         uint8_t *dest[4] = {
2409             dst[0] + dstStride[0] * dstY,
2410             dst[1] + dstStride[1] * chrDstY,
2411             dst[2] + dstStride[2] * chrDstY,
2412             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2413         };
2414
2415         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2416         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2417         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2418         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2419         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2420         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2421         int enough_lines;
2422
2423         //handle holes (FAST_BILINEAR & weird filters)
2424         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2425         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2426         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2427         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2428
2429         DEBUG_BUFFERS("dstY: %d\n", dstY);
2430         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2431                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2432         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2433                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2434
2435         // Do we have enough lines in this slice to output the dstY line
2436         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2437
2438         if (!enough_lines) {
2439             lastLumSrcY = srcSliceY + srcSliceH - 1;
2440             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2441             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2442                                             lastLumSrcY, lastChrSrcY);
2443         }
2444
2445         //Do horizontal scaling
2446         while(lastInLumBuf < lastLumSrcY) {
2447             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2448             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2449             lumBufIndex++;
2450             assert(lumBufIndex < 2*vLumBufSize);
2451             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2452             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2453             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2454                     hLumFilter, hLumFilterPos, hLumFilterSize,
2455                     formatConvBuffer,
2456                     pal, 0);
2457             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2458                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2459                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2460                         formatConvBuffer,
2461                         pal, 1);
2462             lastInLumBuf++;
2463             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2464                                lumBufIndex,    lastInLumBuf);
2465         }
2466         while(lastInChrBuf < lastChrSrcY) {
2467             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2468             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2469             chrBufIndex++;
2470             assert(chrBufIndex < 2*vChrBufSize);
2471             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2472             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2473             //FIXME replace parameters through context struct (some at least)
2474
2475             if (c->needs_hcscale)
2476                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2477                           chrDstW, src1, src2, chrSrcW, chrXInc,
2478                           hChrFilter, hChrFilterPos, hChrFilterSize,
2479                           formatConvBuffer, pal);
2480             lastInChrBuf++;
2481             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2482                                chrBufIndex,    lastInChrBuf);
2483         }
2484         //wrap buf index around to stay inside the ring buffer
2485         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2486         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2487         if (!enough_lines)
2488             break; //we can't output a dstY line so let's try with the next slice
2489
2490 #if HAVE_MMX
2491         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2492 #endif
2493         if (dstY >= dstH-2) {
2494             // hmm looks like we can't use MMX here without overwriting this array's tail
2495             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2496                                            &yuv2packed1, &yuv2packed2,
2497                                            &yuv2packedX);
2498         }
2499
2500         {
2501             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2502             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2503             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2504             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2505             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2506                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2507                 if ((dstY&chrSkipMask) || isGray(dstFormat))
2508                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
2509                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2510                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2511                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
2512                              dest, dstW, chrDstW);
2513                 } else { //General YV12
2514                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
2515                              lumSrcPtr, vLumFilterSize,
2516                              vChrFilter + chrDstY * vChrFilterSize,
2517                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2518                              alpSrcPtr, dest, dstW, chrDstW);
2519                 }
2520             } else {
2521                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2522                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2523                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2524                     int chrAlpha = vChrFilter[2 * dstY + 1];
2525                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2526                                 alpPixBuf ? *alpSrcPtr : NULL,
2527                                 dest[0], dstW, chrAlpha, dstY);
2528                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2529                     int lumAlpha = vLumFilter[2 * dstY + 1];
2530                     int chrAlpha = vChrFilter[2 * dstY + 1];
2531                     lumMmxFilter[2] =
2532                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2533                     chrMmxFilter[2] =
2534                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2535                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2536                                 alpPixBuf ? alpSrcPtr : NULL,
2537                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2538                 } else { //general RGB
2539                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2540                                 lumSrcPtr, vLumFilterSize,
2541                                 vChrFilter + dstY * vChrFilterSize,
2542                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2543                                 alpSrcPtr, dest[0], dstW, dstY);
2544                 }
2545             }
2546         }
2547     }
2548
2549     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2550         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2551
2552 #if HAVE_MMX2
2553     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2554         __asm__ volatile("sfence":::"memory");
2555 #endif
2556     emms_c();
2557
2558     /* store changed local vars back in the context */
2559     c->dstY= dstY;
2560     c->lumBufIndex= lumBufIndex;
2561     c->chrBufIndex= chrBufIndex;
2562     c->lastInLumBuf= lastInLumBuf;
2563     c->lastInChrBuf= lastInChrBuf;
2564
2565     return dstY - lastDstY;
2566 }
2567
2568 static av_cold void sws_init_swScale_c(SwsContext *c)
2569 {
2570     enum PixelFormat srcFormat = c->srcFormat;
2571
2572     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2573                                    &c->yuv2packed1, &c->yuv2packed2,
2574                                    &c->yuv2packedX);
2575
2576     c->chrToYV12 = NULL;
2577     switch(srcFormat) {
2578         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2579         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2580         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2581         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2582         case PIX_FMT_RGB8     :
2583         case PIX_FMT_BGR8     :
2584         case PIX_FMT_PAL8     :
2585         case PIX_FMT_BGR4_BYTE:
2586         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2587 #if HAVE_BIGENDIAN
2588         case PIX_FMT_YUV444P9LE:
2589         case PIX_FMT_YUV420P9LE:
2590         case PIX_FMT_YUV422P10LE:
2591         case PIX_FMT_YUV444P10LE:
2592         case PIX_FMT_YUV420P10LE:
2593         case PIX_FMT_YUV420P16LE:
2594         case PIX_FMT_YUV422P16LE:
2595         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2596 #else
2597         case PIX_FMT_YUV444P9BE:
2598         case PIX_FMT_YUV420P9BE:
2599         case PIX_FMT_YUV444P10BE:
2600         case PIX_FMT_YUV422P10BE:
2601         case PIX_FMT_YUV420P10BE:
2602         case PIX_FMT_YUV420P16BE:
2603         case PIX_FMT_YUV422P16BE:
2604         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2605 #endif
2606     }
2607     if (c->chrSrcHSubSample) {
2608         switch(srcFormat) {
2609         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2610         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2611         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2612         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2613         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2614         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2615         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2616         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2617         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2618         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2619         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2620         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2621         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2622         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2623         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2624         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2625         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2626         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2627         }
2628     } else {
2629         switch(srcFormat) {
2630         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2631         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2632         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2633         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2634         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2635         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2636         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2637         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2638         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2639         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2640         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2641         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2642         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2643         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2644         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2645         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2646         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2647         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2648         }
2649     }
2650
2651     c->lumToYV12 = NULL;
2652     c->alpToYV12 = NULL;
2653     switch (srcFormat) {
2654 #if HAVE_BIGENDIAN
2655     case PIX_FMT_YUV444P9LE:
2656     case PIX_FMT_YUV420P9LE:
2657     case PIX_FMT_YUV444P10LE:
2658     case PIX_FMT_YUV422P10LE:
2659     case PIX_FMT_YUV420P10LE:
2660     case PIX_FMT_YUV420P16LE:
2661     case PIX_FMT_YUV422P16LE:
2662     case PIX_FMT_YUV444P16LE:
2663     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2664 #else
2665     case PIX_FMT_YUV444P9BE:
2666     case PIX_FMT_YUV420P9BE:
2667     case PIX_FMT_YUV444P10BE:
2668     case PIX_FMT_YUV422P10BE:
2669     case PIX_FMT_YUV420P10BE:
2670     case PIX_FMT_YUV420P16BE:
2671     case PIX_FMT_YUV422P16BE:
2672     case PIX_FMT_YUV444P16BE:
2673     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2674 #endif
2675     case PIX_FMT_YUYV422  :
2676     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2677     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2678     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2679     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2680     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2681     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2682     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2683     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2684     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2685     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2686     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2687     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2688     case PIX_FMT_RGB8     :
2689     case PIX_FMT_BGR8     :
2690     case PIX_FMT_PAL8     :
2691     case PIX_FMT_BGR4_BYTE:
2692     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2693     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2694     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2695     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2696     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2697     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2698     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2699     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2700     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2701     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2702     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2703     }
2704     if (c->alpPixBuf) {
2705         switch (srcFormat) {
2706         case PIX_FMT_BGRA:
2707         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2708         case PIX_FMT_ABGR:
2709         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2710         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2711         }
2712     }
2713
2714     if (c->scalingBpp == 8) {
2715     c->hScale       = hScale_c;
2716     if (c->flags & SWS_FAST_BILINEAR) {
2717         c->hyscale_fast = hyscale_fast_c;
2718         c->hcscale_fast = hcscale_fast_c;
2719     }
2720
2721     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2722         if (c->srcRange) {
2723             c->lumConvertRange = lumRangeFromJpeg_c;
2724             c->chrConvertRange = chrRangeFromJpeg_c;
2725         } else {
2726             c->lumConvertRange = lumRangeToJpeg_c;
2727             c->chrConvertRange = chrRangeToJpeg_c;
2728         }
2729     }
2730     } else {
2731         c->hScale = hScale16_c;
2732         c->scale19To15Fw = scale19To15Fw_c;
2733         c->scale8To16Rv  = scale8To16Rv_c;
2734
2735         if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2736             if (c->srcRange) {
2737                 c->lumConvertRange = lumRangeFromJpeg16_c;
2738                 c->chrConvertRange = chrRangeFromJpeg16_c;
2739             } else {
2740                 c->lumConvertRange = lumRangeToJpeg16_c;
2741                 c->chrConvertRange = chrRangeToJpeg16_c;
2742             }
2743         }
2744     }
2745
2746     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2747           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2748         c->needs_hcscale = 1;
2749 }
2750
2751 SwsFunc ff_getSwsFunc(SwsContext *c)
2752 {
2753     sws_init_swScale_c(c);
2754
2755     if (HAVE_MMX)
2756         ff_sws_init_swScale_mmx(c);
2757     if (HAVE_ALTIVEC)
2758         ff_sws_init_swScale_altivec(c);
2759
2760     return swScale;
2761 }