git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185
 186 static av_always_inline void
 187 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 188                       int lumFilterSize, const int16_t *chrFilter,
 189                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 190                       int chrFilterSize, const int32_t **alpSrc,
 191                       uint16_t *dest[4], int dstW, int chrDstW,
 192                       int big_endian, int output_bits)
 193 {
 194     //FIXME Optimize (just quickly written not optimized..)
 195     int i;
 196     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 197              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 198     int shift = 15 + 16 - output_bits;
 199
 200 #define output_pixel(pos, val) \
 201     if (big_endian) { \
 202         if (output_bits == 16) { \
 203             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 204         } else { \
 205             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 206         } \
 207     } else { \
 208         if (output_bits == 16) { \
 209             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 210         } else { \
 211             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 212         } \
 213     }
 214     for (i = 0; i < dstW; i++) {
 215         int val = 1 << (30-output_bits);
 216         int j;
 217
 218         for (j = 0; j < lumFilterSize; j++)
 219             val += lumSrc[j][i] * lumFilter[j];
 220
 221         output_pixel(&yDest[i], val);
 222     }
 223
 224     if (uDest) {
 225         for (i = 0; i < chrDstW; i++) {
 226             int u = 1 << (30-output_bits);
 227             int v = 1 << (30-output_bits);
 228             int j;
 229
 230             for (j = 0; j < chrFilterSize; j++) {
 231                 u += chrUSrc[j][i] * chrFilter[j];
 232                 v += chrVSrc[j][i] * chrFilter[j];
 233             }
 234
 235             output_pixel(&uDest[i], u);
 236             output_pixel(&vDest[i], v);
 237         }
 238     }
 239
 240     if (CONFIG_SWSCALE_ALPHA && aDest) {
 241         for (i = 0; i < dstW; i++) {
 242             int val = 1 << (30-output_bits);
 243             int j;
 244
 245             for (j = 0; j < lumFilterSize; j++)
 246                 val += alpSrc[j][i] * lumFilter[j];
 247
 248             output_pixel(&aDest[i], val);
 249         }
 250     }
 251 #undef output_pixel
 252 }
 253
 254 #define yuv2NBPS(bits, BE_LE, is_be) \
 255 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 256                               const int16_t **_lumSrc, int lumFilterSize, \
 257                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 258                               const int16_t **_chrVSrc, \
 259                               int chrFilterSize, const int16_t **_alpSrc, \
 260                               uint8_t *_dest[4], int dstW, int chrDstW) \
 261 { \
 262     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 263                   **chrUSrc = (const int32_t **) _chrUSrc, \
 264                   **chrVSrc = (const int32_t **) _chrVSrc, \
 265                   **alpSrc  = (const int32_t **) _alpSrc; \
 266     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 267                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 268                           alpSrc, (uint16_t **) _dest, \
 269                           dstW, chrDstW, is_be, bits); \
 270 }
 271 yuv2NBPS( 9, BE, 1);
 272 yuv2NBPS( 9, LE, 0);
 273 yuv2NBPS(10, BE, 1);
 274 yuv2NBPS(10, LE, 0);
 275 yuv2NBPS(16, BE, 1);
 276 yuv2NBPS(16, LE, 0);
 277
 278 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 279                        const int16_t **lumSrc, int lumFilterSize,
 280                        const int16_t *chrFilter, const int16_t **chrUSrc,
 281                        const int16_t **chrVSrc,
 282                        int chrFilterSize, const int16_t **alpSrc,
 283                        uint8_t *dest[4], int dstW, int chrDstW)
 284 {
 285     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 286             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 287     int i;
 288
 289     //FIXME Optimize (just quickly written not optimized..)
 290     for (i=0; i<dstW; i++) {
 291         int val=1<<18;
 292         int j;
 293         for (j=0; j<lumFilterSize; j++)
 294             val += lumSrc[j][i] * lumFilter[j];
 295
 296         yDest[i]= av_clip_uint8(val>>19);
 297     }
 298
 299     if (uDest)
 300         for (i=0; i<chrDstW; i++) {
 301             int u=1<<18;
 302             int v=1<<18;
 303             int j;
 304             for (j=0; j<chrFilterSize; j++) {
 305                 u += chrUSrc[j][i] * chrFilter[j];
 306                 v += chrVSrc[j][i] * chrFilter[j];
 307             }
 308
 309             uDest[i]= av_clip_uint8(u>>19);
 310             vDest[i]= av_clip_uint8(v>>19);
 311         }
 312
 313     if (CONFIG_SWSCALE_ALPHA && aDest)
 314         for (i=0; i<dstW; i++) {
 315             int val=1<<18;
 316             int j;
 317             for (j=0; j<lumFilterSize; j++)
 318                 val += alpSrc[j][i] * lumFilter[j];
 319
 320             aDest[i]= av_clip_uint8(val>>19);
 321         }
 322 }
 323
 324 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 325                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 326                        const int16_t *alpSrc,
 327                        uint8_t *dest[4], int dstW, int chrDstW)
 328 {
 329     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 330             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 331     int i;
 332
 333     for (i=0; i<dstW; i++) {
 334         int val= (lumSrc[i]+64)>>7;
 335         yDest[i]= av_clip_uint8(val);
 336     }
 337
 338     if (uDest)
 339         for (i=0; i<chrDstW; i++) {
 340             int u=(chrUSrc[i]+64)>>7;
 341             int v=(chrVSrc[i]+64)>>7;
 342             uDest[i]= av_clip_uint8(u);
 343             vDest[i]= av_clip_uint8(v);
 344         }
 345
 346     if (CONFIG_SWSCALE_ALPHA && aDest)
 347         for (i=0; i<dstW; i++) {
 348             int val= (alpSrc[i]+64)>>7;
 349             aDest[i]= av_clip_uint8(val);
 350         }
 351 }
 352
 353 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 354                         const int16_t **lumSrc, int lumFilterSize,
 355                         const int16_t *chrFilter, const int16_t **chrUSrc,
 356                         const int16_t **chrVSrc, int chrFilterSize,
 357                         const int16_t **alpSrc, uint8_t *dest[4],
 358                         int dstW, int chrDstW)
 359 {
 360     uint8_t *yDest = dest[0], *uDest = dest[1];
 361     enum PixelFormat dstFormat = c->dstFormat;
 362
 363     //FIXME Optimize (just quickly written not optimized..)
 364     int i;
 365     for (i=0; i<dstW; i++) {
 366         int val=1<<18;
 367         int j;
 368         for (j=0; j<lumFilterSize; j++)
 369             val += lumSrc[j][i] * lumFilter[j];
 370
 371         yDest[i]= av_clip_uint8(val>>19);
 372     }
 373
 374     if (!uDest)
 375         return;
 376
 377     if (dstFormat == PIX_FMT_NV12)
 378         for (i=0; i<chrDstW; i++) {
 379             int u=1<<18;
 380             int v=1<<18;
 381             int j;
 382             for (j=0; j<chrFilterSize; j++) {
 383                 u += chrUSrc[j][i] * chrFilter[j];
 384                 v += chrVSrc[j][i] * chrFilter[j];
 385             }
 386
 387             uDest[2*i]= av_clip_uint8(u>>19);
 388             uDest[2*i+1]= av_clip_uint8(v>>19);
 389         }
 390     else
 391         for (i=0; i<chrDstW; i++) {
 392             int u=1<<18;
 393             int v=1<<18;
 394             int j;
 395             for (j=0; j<chrFilterSize; j++) {
 396                 u += chrUSrc[j][i] * chrFilter[j];
 397                 v += chrVSrc[j][i] * chrFilter[j];
 398             }
 399
 400             uDest[2*i]= av_clip_uint8(v>>19);
 401             uDest[2*i+1]= av_clip_uint8(u>>19);
 402         }
 403 }
 404
 405 #define output_pixel(pos, val) \
 406         if (target == PIX_FMT_GRAY16BE) { \
 407             AV_WB16(pos, val); \
 408         } else { \
 409             AV_WL16(pos, val); \
 410         }
 411
 412 static av_always_inline void
 413 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 414                         const int32_t **lumSrc, int lumFilterSize,
 415                         const int16_t *chrFilter, const int32_t **chrUSrc,
 416                         const int32_t **chrVSrc, int chrFilterSize,
 417                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 418                         int y, enum PixelFormat target)
 419 {
 420     int i;
 421
 422     for (i = 0; i < (dstW >> 1); i++) {
 423         int j;
 424         int Y1 = 1 << 14;
 425         int Y2 = 1 << 14;
 426
 427         for (j = 0; j < lumFilterSize; j++) {
 428             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 429             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 430         }
 431         Y1 >>= 15;
 432         Y2 >>= 15;
 433         if ((Y1 | Y2) & 0x10000) {
 434             Y1 = av_clip_uint16(Y1);
 435             Y2 = av_clip_uint16(Y2);
 436         }
 437         output_pixel(&dest[i * 2 + 0], Y1);
 438         output_pixel(&dest[i * 2 + 1], Y2);
 439     }
 440 }
 441
 442 static av_always_inline void
 443 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 444                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 445                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 446                         int yalpha, int uvalpha, int y,
 447                         enum PixelFormat target)
 448 {
 449     int  yalpha1 = 4095 - yalpha;
 450     int i;
 451     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 452
 453     for (i = 0; i < (dstW >> 1); i++) {
 454         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 455         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 456
 457         output_pixel(&dest[i * 2 + 0], Y1);
 458         output_pixel(&dest[i * 2 + 1], Y2);
 459     }
 460 }
 461
 462 static av_always_inline void
 463 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 464                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 465                         const int32_t *abuf0, uint16_t *dest, int dstW,
 466                         int uvalpha, int y, enum PixelFormat target)
 467 {
 468     int i;
 469
 470     for (i = 0; i < (dstW >> 1); i++) {
 471         int Y1 = buf0[i * 2    ] << 1;
 472         int Y2 = buf0[i * 2 + 1] << 1;
 473
 474         output_pixel(&dest[i * 2 + 0], Y1);
 475         output_pixel(&dest[i * 2 + 1], Y2);
 476     }
 477 }
 478
 479 #undef output_pixel
 480
 481 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 482 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 483                         const int16_t **_lumSrc, int lumFilterSize, \
 484                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 485                         const int16_t **_chrVSrc, int chrFilterSize, \
 486                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 487                         int y) \
 488 { \
 489     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 490                   **chrUSrc = (const int32_t **) _chrUSrc, \
 491                   **chrVSrc = (const int32_t **) _chrVSrc, \
 492                   **alpSrc  = (const int32_t **) _alpSrc; \
 493     uint16_t *dest = (uint16_t *) _dest; \
 494     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 495                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 496                           alpSrc, dest, dstW, y, fmt); \
 497 } \
 498  \
 499 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 500                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 501                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 502                         int yalpha, int uvalpha, int y) \
 503 { \
 504     const int32_t **buf  = (const int32_t **) _buf, \
 505                   **ubuf = (const int32_t **) _ubuf, \
 506                   **vbuf = (const int32_t **) _vbuf, \
 507                   **abuf = (const int32_t **) _abuf; \
 508     uint16_t *dest = (uint16_t *) _dest; \
 509     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 510                           dest, dstW, yalpha, uvalpha, y, fmt); \
 511 } \
 512  \
 513 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 514                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 515                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 516                         int uvalpha, int y) \
 517 { \
 518     const int32_t *buf0  = (const int32_t *)  _buf0, \
 519                  **ubuf  = (const int32_t **) _ubuf, \
 520                  **vbuf  = (const int32_t **) _vbuf, \
 521                   *abuf0 = (const int32_t *)  _abuf0; \
 522     uint16_t *dest = (uint16_t *) _dest; \
 523     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 524                                   dstW, uvalpha, y, fmt); \
 525 }
 526
 527 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 528 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 529
 530 #define output_pixel(pos, acc) \
 531     if (target == PIX_FMT_MONOBLACK) { \
 532         pos = acc; \
 533     } else { \
 534         pos = ~acc; \
 535     }
 536
 537 static av_always_inline void
 538 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 539                       const int16_t **lumSrc, int lumFilterSize,
 540                       const int16_t *chrFilter, const int16_t **chrUSrc,
 541                       const int16_t **chrVSrc, int chrFilterSize,
 542                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 543                       int y, enum PixelFormat target)
 544 {
 545     const uint8_t * const d128=dither_8x8_220[y&7];
 546     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 547     int i;
 548     int acc = 0;
 549
 550     for (i = 0; i < dstW - 1; i += 2) {
 551         int j;
 552         int Y1 = 1 << 18;
 553         int Y2 = 1 << 18;
 554
 555         for (j = 0; j < lumFilterSize; j++) {
 556             Y1 += lumSrc[j][i]   * lumFilter[j];
 557             Y2 += lumSrc[j][i+1] * lumFilter[j];
 558         }
 559         Y1 >>= 19;
 560         Y2 >>= 19;
 561         if ((Y1 | Y2) & 0x100) {
 562             Y1 = av_clip_uint8(Y1);
 563             Y2 = av_clip_uint8(Y2);
 564         }
 565         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 566         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 567         if ((i & 7) == 6) {
 568             output_pixel(*dest++, acc);
 569         }
 570     }
 571 }
 572
 573 static av_always_inline void
 574 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 575                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 576                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 577                       int yalpha, int uvalpha, int y,
 578                       enum PixelFormat target)
 579 {
 580     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 581     const uint8_t * const d128 = dither_8x8_220[y & 7];
 582     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 583     int  yalpha1 = 4095 - yalpha;
 584     int i;
 585
 586     for (i = 0; i < dstW - 7; i += 8) {
 587         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 588         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 589         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 590         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 591         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 592         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 593         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 594         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 595         output_pixel(*dest++, acc);
 596     }
 597 }
 598
 599 static av_always_inline void
 600 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 601                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 602                       const int16_t *abuf0, uint8_t *dest, int dstW,
 603                       int uvalpha, int y, enum PixelFormat target)
 604 {
 605     const uint8_t * const d128 = dither_8x8_220[y & 7];
 606     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 607     int i;
 608
 609     for (i = 0; i < dstW - 7; i += 8) {
 610         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 611         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 612         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 613         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 614         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 615         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 616         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 617         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 618         output_pixel(*dest++, acc);
 619     }
 620 }
 621
 622 #undef output_pixel
 623
 624 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 625 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 626                                 const int16_t **lumSrc, int lumFilterSize, \
 627                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 628                                 const int16_t **chrVSrc, int chrFilterSize, \
 629                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 630                                 int y) \
 631 { \
 632     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 633                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 634                                   alpSrc, dest, dstW, y, fmt); \
 635 } \
 636  \
 637 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 638                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 639                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 640                                 int yalpha, int uvalpha, int y) \
 641 { \
 642     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 643                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 644 } \
 645  \
 646 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 647                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 648                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 649                                 int uvalpha, int y) \
 650 { \
 651     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 652                                   abuf0, dest, dstW, uvalpha, \
 653                                   y, fmt); \
 654 }
 655
 656 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 657 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 658
 659 #define output_pixels(pos, Y1, U, Y2, V) \
 660     if (target == PIX_FMT_YUYV422) { \
 661         dest[pos + 0] = Y1; \
 662         dest[pos + 1] = U;  \
 663         dest[pos + 2] = Y2; \
 664         dest[pos + 3] = V;  \
 665     } else { \
 666         dest[pos + 0] = U;  \
 667         dest[pos + 1] = Y1; \
 668         dest[pos + 2] = V;  \
 669         dest[pos + 3] = Y2; \
 670     }
 671
 672 static av_always_inline void
 673 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 674                      const int16_t **lumSrc, int lumFilterSize,
 675                      const int16_t *chrFilter, const int16_t **chrUSrc,
 676                      const int16_t **chrVSrc, int chrFilterSize,
 677                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 678                      int y, enum PixelFormat target)
 679 {
 680     int i;
 681
 682     for (i = 0; i < (dstW >> 1); i++) {
 683         int j;
 684         int Y1 = 1 << 18;
 685         int Y2 = 1 << 18;
 686         int U  = 1 << 18;
 687         int V  = 1 << 18;
 688
 689         for (j = 0; j < lumFilterSize; j++) {
 690             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 691             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 692         }
 693         for (j = 0; j < chrFilterSize; j++) {
 694             U += chrUSrc[j][i] * chrFilter[j];
 695             V += chrVSrc[j][i] * chrFilter[j];
 696         }
 697         Y1 >>= 19;
 698         Y2 >>= 19;
 699         U  >>= 19;
 700         V  >>= 19;
 701         if ((Y1 | Y2 | U | V) & 0x100) {
 702             Y1 = av_clip_uint8(Y1);
 703             Y2 = av_clip_uint8(Y2);
 704             U  = av_clip_uint8(U);
 705             V  = av_clip_uint8(V);
 706         }
 707         output_pixels(4*i, Y1, U, Y2, V);
 708     }
 709 }
 710
 711 static av_always_inline void
 712 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 713                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 714                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 715                      int yalpha, int uvalpha, int y,
 716                      enum PixelFormat target)
 717 {
 718     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 719                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 720                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 721     int  yalpha1 = 4095 - yalpha;
 722     int uvalpha1 = 4095 - uvalpha;
 723     int i;
 724
 725     for (i = 0; i < (dstW >> 1); i++) {
 726         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 727         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 728         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 729         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 730
 731         output_pixels(i * 4, Y1, U, Y2, V);
 732     }
 733 }
 734
 735 static av_always_inline void
 736 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 737                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 738                      const int16_t *abuf0, uint8_t *dest, int dstW,
 739                      int uvalpha, int y, enum PixelFormat target)
 740 {
 741     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 742                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 743     int i;
 744
 745     if (uvalpha < 2048) {
 746         for (i = 0; i < (dstW >> 1); i++) {
 747             int Y1 = buf0[i * 2]     >> 7;
 748             int Y2 = buf0[i * 2 + 1] >> 7;
 749             int U  = ubuf1[i]        >> 7;
 750             int V  = vbuf1[i]        >> 7;
 751
 752             output_pixels(i * 4, Y1, U, Y2, V);
 753         }
 754     } else {
 755         for (i = 0; i < (dstW >> 1); i++) {
 756             int Y1 =  buf0[i * 2]          >> 7;
 757             int Y2 =  buf0[i * 2 + 1]      >> 7;
 758             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 759             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 760
 761             output_pixels(i * 4, Y1, U, Y2, V);
 762         }
 763     }
 764 }
 765
 766 #undef output_pixels
 767
 768 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
 769 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 770
 771 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 772 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 773 #define output_pixel(pos, val) \
 774     if (isBE(target)) { \
 775         AV_WB16(pos, val); \
 776     } else { \
 777         AV_WL16(pos, val); \
 778     }
 779
 780 static av_always_inline void
 781 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 782                        const int32_t **lumSrc, int lumFilterSize,
 783                        const int16_t *chrFilter, const int32_t **chrUSrc,
 784                        const int32_t **chrVSrc, int chrFilterSize,
 785                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 786                        int y, enum PixelFormat target)
 787 {
 788     int i;
 789
 790     for (i = 0; i < (dstW >> 1); i++) {
 791         int j;
 792         int Y1 = 0;
 793         int Y2 = 0;
 794         int U  = -128 << 23; // 19
 795         int V  = -128 << 23;
 796         int R, G, B;
 797
 798         for (j = 0; j < lumFilterSize; j++) {
 799             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 800             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 801         }
 802         for (j = 0; j < chrFilterSize; j++) {
 803             U += chrUSrc[j][i] * chrFilter[j];
 804             V += chrVSrc[j][i] * chrFilter[j];
 805         }
 806
 807         // 8bit: 12+15=27; 16-bit: 12+19=31
 808         Y1 >>= 14; // 10
 809         Y2 >>= 14;
 810         U  >>= 14;
 811         V  >>= 14;
 812
 813         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 814         Y1 -= c->yuv2rgb_y_offset;
 815         Y2 -= c->yuv2rgb_y_offset;
 816         Y1 *= c->yuv2rgb_y_coeff;
 817         Y2 *= c->yuv2rgb_y_coeff;
 818         Y1 += 1 << 13; // 21
 819         Y2 += 1 << 13;
 820         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 821
 822         R = V * c->yuv2rgb_v2r_coeff;
 823         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 824         B =                            U * c->yuv2rgb_u2b_coeff;
 825
 826         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 827         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 828         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 829         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 830         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 831         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 832         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 833         dest += 6;
 834     }
 835 }
 836
 837 static av_always_inline void
 838 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 839                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 840                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 841                        int yalpha, int uvalpha, int y,
 842                        enum PixelFormat target)
 843 {
 844     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 845                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 846                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 847     int  yalpha1 = 4095 - yalpha;
 848     int uvalpha1 = 4095 - uvalpha;
 849     int i;
 850
 851     for (i = 0; i < (dstW >> 1); i++) {
 852         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 853         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 854         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 855         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 856         int R, G, B;
 857
 858         Y1 -= c->yuv2rgb_y_offset;
 859         Y2 -= c->yuv2rgb_y_offset;
 860         Y1 *= c->yuv2rgb_y_coeff;
 861         Y2 *= c->yuv2rgb_y_coeff;
 862         Y1 += 1 << 13;
 863         Y2 += 1 << 13;
 864
 865         R = V * c->yuv2rgb_v2r_coeff;
 866         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 867         B =                            U * c->yuv2rgb_u2b_coeff;
 868
 869         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 870         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 871         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 872         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 873         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 874         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 875         dest += 6;
 876     }
 877 }
 878
 879 static av_always_inline void
 880 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 881                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 882                        const int32_t *abuf0, uint16_t *dest, int dstW,
 883                        int uvalpha, int y, enum PixelFormat target)
 884 {
 885     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 886                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 887     int i;
 888
 889     if (uvalpha < 2048) {
 890         for (i = 0; i < (dstW >> 1); i++) {
 891             int Y1 = (buf0[i * 2]    ) >> 2;
 892             int Y2 = (buf0[i * 2 + 1]) >> 2;
 893             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 894             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 895             int R, G, B;
 896
 897             Y1 -= c->yuv2rgb_y_offset;
 898             Y2 -= c->yuv2rgb_y_offset;
 899             Y1 *= c->yuv2rgb_y_coeff;
 900             Y2 *= c->yuv2rgb_y_coeff;
 901             Y1 += 1 << 13;
 902             Y2 += 1 << 13;
 903
 904             R = V * c->yuv2rgb_v2r_coeff;
 905             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 906             B =                            U * c->yuv2rgb_u2b_coeff;
 907
 908             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 909             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 910             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 911             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 912             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 913             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 914             dest += 6;
 915         }
 916     } else {
 917         for (i = 0; i < (dstW >> 1); i++) {
 918             int Y1 = (buf0[i * 2]    ) >> 2;
 919             int Y2 = (buf0[i * 2 + 1]) >> 2;
 920             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 11)) >> 3;
 921             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 11)) >> 3;
 922             int R, G, B;
 923
 924             Y1 -= c->yuv2rgb_y_offset;
 925             Y2 -= c->yuv2rgb_y_offset;
 926             Y1 *= c->yuv2rgb_y_coeff;
 927             Y2 *= c->yuv2rgb_y_coeff;
 928             Y1 += 1 << 13;
 929             Y2 += 1 << 13;
 930
 931             R = V * c->yuv2rgb_v2r_coeff;
 932             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 933             B =                            U * c->yuv2rgb_u2b_coeff;
 934
 935             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 936             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 937             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 938             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 939             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 940             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 941             dest += 6;
 942         }
 943     }
 944 }
 945
 946 #undef output_pixel
 947 #undef r_b
 948 #undef b_r
 949
 950 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
 951 YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
 952 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
 953 YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
 954
 955 static av_always_inline void
 956 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
 957               int U, int V, int A1, int A2,
 958               const void *_r, const void *_g, const void *_b, int y,
 959               enum PixelFormat target, int hasAlpha)
 960 {
 961     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 962         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 963         uint32_t *dest = (uint32_t *) _dest;
 964         const uint32_t *r = (const uint32_t *) _r;
 965         const uint32_t *g = (const uint32_t *) _g;
 966         const uint32_t *b = (const uint32_t *) _b;
 967
 968 #if CONFIG_SMALL
 969         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 970
 971         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
 972         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
 973 #else
 974         if (hasAlpha) {
 975             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
 976
 977             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
 978             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
 979         } else {
 980             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
 981             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 982         }
 983 #endif
 984     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
 985         uint8_t *dest = (uint8_t *) _dest;
 986         const uint8_t *r = (const uint8_t *) _r;
 987         const uint8_t *g = (const uint8_t *) _g;
 988         const uint8_t *b = (const uint8_t *) _b;
 989
 990 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
 991 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
 992         dest[i * 6 + 0] = r_b[Y1];
 993         dest[i * 6 + 1] =   g[Y1];
 994         dest[i * 6 + 2] = b_r[Y1];
 995         dest[i * 6 + 3] = r_b[Y2];
 996         dest[i * 6 + 4] =   g[Y2];
 997         dest[i * 6 + 5] = b_r[Y2];
 998 #undef r_b
 999 #undef b_r
1000     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1001                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1002                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1003         uint16_t *dest = (uint16_t *) _dest;
1004         const uint16_t *r = (const uint16_t *) _r;
1005         const uint16_t *g = (const uint16_t *) _g;
1006         const uint16_t *b = (const uint16_t *) _b;
1007         int dr1, dg1, db1, dr2, dg2, db2;
1008
1009         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1010             dr1 = dither_2x2_8[ y & 1     ][0];
1011             dg1 = dither_2x2_4[ y & 1     ][0];
1012             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1013             dr2 = dither_2x2_8[ y & 1     ][1];
1014             dg2 = dither_2x2_4[ y & 1     ][1];
1015             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1016         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1017             dr1 = dither_2x2_8[ y & 1     ][0];
1018             dg1 = dither_2x2_8[ y & 1     ][1];
1019             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1020             dr2 = dither_2x2_8[ y & 1     ][1];
1021             dg2 = dither_2x2_8[ y & 1     ][0];
1022             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1023         } else {
1024             dr1 = dither_4x4_16[ y & 3     ][0];
1025             dg1 = dither_4x4_16[ y & 3     ][1];
1026             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1027             dr2 = dither_4x4_16[ y & 3     ][1];
1028             dg2 = dither_4x4_16[ y & 3     ][0];
1029             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1030         }
1031
1032         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1033         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1034     } else /* 8/4-bit */ {
1035         uint8_t *dest = (uint8_t *) _dest;
1036         const uint8_t *r = (const uint8_t *) _r;
1037         const uint8_t *g = (const uint8_t *) _g;
1038         const uint8_t *b = (const uint8_t *) _b;
1039         int dr1, dg1, db1, dr2, dg2, db2;
1040
1041         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1042             const uint8_t * const d64 = dither_8x8_73[y & 7];
1043             const uint8_t * const d32 = dither_8x8_32[y & 7];
1044             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1045             db1 =       d64[(i * 2 + 0) & 7];
1046             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1047             db2 =       d64[(i * 2 + 1) & 7];
1048         } else {
1049             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1050             const uint8_t * const d128 = dither_8x8_220[y & 7];
1051             dr1 = db1 = d128[(i * 2 + 0) & 7];
1052             dg1 =        d64[(i * 2 + 0) & 7];
1053             dr2 = db2 = d128[(i * 2 + 1) & 7];
1054             dg2 =        d64[(i * 2 + 1) & 7];
1055         }
1056
1057         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1058             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1059                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1060         } else {
1061             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1062             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1063         }
1064     }
1065 }
1066
1067 static av_always_inline void
1068 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1069                      const int16_t **lumSrc, int lumFilterSize,
1070                      const int16_t *chrFilter, const int16_t **chrUSrc,
1071                      const int16_t **chrVSrc, int chrFilterSize,
1072                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1073                      int y, enum PixelFormat target, int hasAlpha)
1074 {
1075     int i;
1076
1077     for (i = 0; i < (dstW >> 1); i++) {
1078         int j;
1079         int Y1 = 1 << 18;
1080         int Y2 = 1 << 18;
1081         int U  = 1 << 18;
1082         int V  = 1 << 18;
1083         int av_unused A1, A2;
1084         const void *r, *g, *b;
1085
1086         for (j = 0; j < lumFilterSize; j++) {
1087             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1088             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1089         }
1090         for (j = 0; j < chrFilterSize; j++) {
1091             U += chrUSrc[j][i] * chrFilter[j];
1092             V += chrVSrc[j][i] * chrFilter[j];
1093         }
1094         Y1 >>= 19;
1095         Y2 >>= 19;
1096         U  >>= 19;
1097         V  >>= 19;
1098         if ((Y1 | Y2 | U | V) & 0x100) {
1099             Y1 = av_clip_uint8(Y1);
1100             Y2 = av_clip_uint8(Y2);
1101             U  = av_clip_uint8(U);
1102             V  = av_clip_uint8(V);
1103         }
1104         if (hasAlpha) {
1105             A1 = 1 << 18;
1106             A2 = 1 << 18;
1107             for (j = 0; j < lumFilterSize; j++) {
1108                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1109                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1110             }
1111             A1 >>= 19;
1112             A2 >>= 19;
1113             if ((A1 | A2) & 0x100) {
1114                 A1 = av_clip_uint8(A1);
1115                 A2 = av_clip_uint8(A2);
1116             }
1117         }
1118
1119         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1120         r =  c->table_rV[V];
1121         g = (c->table_gU[U] + c->table_gV[V]);
1122         b =  c->table_bU[U];
1123
1124         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1125                       r, g, b, y, target, hasAlpha);
1126     }
1127 }
1128
1129 static av_always_inline void
1130 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1131                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1132                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1133                      int yalpha, int uvalpha, int y,
1134                      enum PixelFormat target, int hasAlpha)
1135 {
1136     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1137                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1138                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1139                   *abuf0 = abuf[0], *abuf1 = abuf[1];
1140     int  yalpha1 = 4095 - yalpha;
1141     int uvalpha1 = 4095 - uvalpha;
1142     int i;
1143
1144     for (i = 0; i < (dstW >> 1); i++) {
1145         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1146         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1147         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1148         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1149         int A1, A2;
1150         const void *r =  c->table_rV[V],
1151                    *g = (c->table_gU[U] + c->table_gV[V]),
1152                    *b =  c->table_bU[U];
1153
1154         if (hasAlpha) {
1155             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1156             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1157         }
1158
1159         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1160                       r, g, b, y, target, hasAlpha);
1161     }
1162 }
1163
1164 static av_always_inline void
1165 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1166                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1167                      const int16_t *abuf0, uint8_t *dest, int dstW,
1168                      int uvalpha, int y, enum PixelFormat target,
1169                      int hasAlpha)
1170 {
1171     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1172                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1173     int i;
1174
1175     if (uvalpha < 2048) {
1176         for (i = 0; i < (dstW >> 1); i++) {
1177             int Y1 = buf0[i * 2]     >> 7;
1178             int Y2 = buf0[i * 2 + 1] >> 7;
1179             int U  = ubuf1[i]        >> 7;
1180             int V  = vbuf1[i]        >> 7;
1181             int A1, A2;
1182             const void *r =  c->table_rV[V],
1183                        *g = (c->table_gU[U] + c->table_gV[V]),
1184                        *b =  c->table_bU[U];
1185
1186             if (hasAlpha) {
1187                 A1 = abuf0[i * 2    ] >> 7;
1188                 A2 = abuf0[i * 2 + 1] >> 7;
1189             }
1190
1191             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1192                           r, g, b, y, target, hasAlpha);
1193         }
1194     } else {
1195         for (i = 0; i < (dstW >> 1); i++) {
1196             int Y1 =  buf0[i * 2]          >> 7;
1197             int Y2 =  buf0[i * 2 + 1]      >> 7;
1198             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1199             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1200             int A1, A2;
1201             const void *r =  c->table_rV[V],
1202                        *g = (c->table_gU[U] + c->table_gV[V]),
1203                        *b =  c->table_bU[U];
1204
1205             if (hasAlpha) {
1206                 A1 = abuf0[i * 2    ] >> 7;
1207                 A2 = abuf0[i * 2 + 1] >> 7;
1208             }
1209
1210             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1211                           r, g, b, y, target, hasAlpha);
1212         }
1213     }
1214 }
1215
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
 * Defines the function <name><ext>_X_c(), which forwards all its arguments
 * to <name><base>_X_c_template() and appends fmt and hasAlpha as the
 * template's target / hasAlpha arguments.
 */
1216 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1217 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
1218                                 const int16_t **lumSrc, int lumFilterSize, \
1219                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
1220                                 const int16_t **chrVSrc, int chrFilterSize, \
1221                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
1222                                 int y) \
1223 { \
1224     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
1225                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1226                                   alpSrc, dest, dstW, y, fmt, hasAlpha); \
1227 }
/*
 * YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
 * Expands YUV2RGBWRAPPERX and additionally defines <name><ext>_2_c() and
 * <name><ext>_1_c(), which forward to <name><base>_2_c_template() and
 * <name><base>_1_c_template() in the same way. hasAlpha is pasted into the
 * function bodies as an expression, so it may refer to the parameter c.
 */
1228 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1229 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1230 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
1231                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1232                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
1233                                 int yalpha, int uvalpha, int y) \
1234 { \
1235     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
1236                                   dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1237 } \
1238  \
1239 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
1240                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
1241                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
1242                                 int uvalpha, int y) \
1243 { \
1244     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1245                                   dstW, uvalpha, y, fmt, hasAlpha); \
1246 }
1247
/*
 * Table-based RGB writers, all built on the yuv2rgb_{X,2,1}_c_template
 * functions. With CONFIG_SMALL there is one 32-bit variant per layout and
 * hasAlpha is evaluated at run time from c->alpPixBuf; otherwise separate
 * "a" (hasAlpha = 1, only if CONFIG_SWSCALE_ALPHA) and "x" (hasAlpha = 0)
 * variants are generated with a constant hasAlpha.
 */
1248 #if CONFIG_SMALL
1249 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1250 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1251 #else
1252 #if CONFIG_SWSCALE_ALPHA
1253 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1254 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1255 #endif
1256 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1257 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1258 #endif
/* The remaining formats never carry alpha. */
1259 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1260 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1261 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1262 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1263 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1264 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1265 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1266 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1266 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1267
1268 static av_always_inline void
1269 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1270                           const int16_t **lumSrc, int lumFilterSize,
1271                           const int16_t *chrFilter, const int16_t **chrUSrc,
1272                           const int16_t **chrVSrc, int chrFilterSize,
1273                           const int16_t **alpSrc, uint8_t *dest,
1274                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1275 {
1276     int i;
1277     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1278
1279     for (i = 0; i < dstW; i++) {
1280         int j;
1281         int Y = 0;
1282         int U = -128 << 19;
1283         int V = -128 << 19;
1284         int av_unused A;
1285         int R, G, B;
1286
1287         for (j = 0; j < lumFilterSize; j++) {
1288             Y += lumSrc[j][i] * lumFilter[j];
1289         }
1290         for (j = 0; j < chrFilterSize; j++) {
1291             U += chrUSrc[j][i] * chrFilter[j];
1292             V += chrVSrc[j][i] * chrFilter[j];
1293         }
1294         Y >>= 10;
1295         U >>= 10;
1296         V >>= 10;
1297         if (hasAlpha) {
1298             A = 1 << 21;
1299             for (j = 0; j < lumFilterSize; j++) {
1300                 A += alpSrc[j][i] * lumFilter[j];
1301             }
1302             A >>= 19;
1303             if (A & 0x100)
1304                 A = av_clip_uint8(A);
1305         }
1306         Y -= c->yuv2rgb_y_offset;
1307         Y *= c->yuv2rgb_y_coeff;
1308         Y += 1 << 21;
1309         R = Y + V*c->yuv2rgb_v2r_coeff;
1310         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1311         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1312         if ((R | G | B) & 0xC0000000) {
1313             R = av_clip_uintp2(R, 30);
1314             G = av_clip_uintp2(G, 30);
1315             B = av_clip_uintp2(B, 30);
1316         }
1317
1318         switch(target) {
1319         case PIX_FMT_ARGB:
1320             dest[0] = hasAlpha ? A : 255;
1321             dest[1] = R >> 22;
1322             dest[2] = G >> 22;
1323             dest[3] = B >> 22;
1324             break;
1325         case PIX_FMT_RGB24:
1326             dest[0] = R >> 22;
1327             dest[1] = G >> 22;
1328             dest[2] = B >> 22;
1329             break;
1330         case PIX_FMT_RGBA:
1331             dest[0] = R >> 22;
1332             dest[1] = G >> 22;
1333             dest[2] = B >> 22;
1334             dest[3] = hasAlpha ? A : 255;
1335             break;
1336         case PIX_FMT_ABGR:
1337             dest[0] = hasAlpha ? A : 255;
1338             dest[1] = B >> 22;
1339             dest[2] = G >> 22;
1340             dest[3] = R >> 22;
1341             dest += 4;
1342             break;
1343         case PIX_FMT_BGR24:
1344             dest[0] = B >> 22;
1345             dest[1] = G >> 22;
1346             dest[2] = R >> 22;
1347             break;
1348         case PIX_FMT_BGRA:
1349             dest[0] = B >> 22;
1350             dest[1] = G >> 22;
1351             dest[2] = R >> 22;
1352             dest[3] = hasAlpha ? A : 255;
1353             break;
1354         }
1355         dest += step;
1356     }
1357 }
1358
/*
 * Full-chroma RGB writers; only the _X_c entry point is generated, each
 * forwarding to yuv2rgb_full_X_c_template(). With CONFIG_SMALL hasAlpha is
 * evaluated at run time from c->alpPixBuf; otherwise separate alpha
 * (hasAlpha = 1, only if CONFIG_SWSCALE_ALPHA) and "x" (hasAlpha = 0)
 * variants are generated.
 */
1359 #if CONFIG_SMALL
1360 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1361 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1362 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1363 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1364 #else
1365 #if CONFIG_SWSCALE_ALPHA
1366 YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
1367 YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
1368 YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
1369 YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
1370 #endif
1371 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
1372 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
1373 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
1374 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
1375 #endif
/* 24-bit output has no alpha byte. */
1376 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
1377 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1378
1379 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1380                                        int width, int height,
1381                                        int y, uint8_t val)
1382 {
1383     int i;
1384     uint8_t *ptr = plane + stride*y;
1385     for (i=0; i<height; i++) {
1386         memset(ptr, val, width);
1387         ptr += stride;
1388     }
1389 }
1390
/* Read one 16-bit component at pos, big- or little-endian depending on
 * isBE(origin). Expects `origin` to be in scope at the point of use. */
1391 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1392
/* The templates below load the first and third stored component into locals
 * named r_b and b_r; these macros map them to red and blue: for the BGR48
 * origins the first component is blue, otherwise it is red. */
1393 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1394 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1395
1396 static av_always_inline void
1397 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1398                     enum PixelFormat origin)
1399 {
1400     int i;
1401     for (i = 0; i < width; i++) {
1402         unsigned int r_b = input_pixel(&src[i*3+0]);
1403         unsigned int   g = input_pixel(&src[i*3+1]);
1404         unsigned int b_r = input_pixel(&src[i*3+2]);
1405
1406         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1407     }
1408 }
1409
1410 static av_always_inline void
1411 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1412                     const uint16_t *src1, const uint16_t *src2,
1413                     int width, enum PixelFormat origin)
1414 {
1415     int i;
1416     assert(src1==src2);
1417     for (i = 0; i < width; i++) {
1418         int r_b = input_pixel(&src1[i*3+0]);
1419         int   g = input_pixel(&src1[i*3+1]);
1420         int b_r = input_pixel(&src1[i*3+2]);
1421
1422         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1423         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1424     }
1425 }
1426
1427 static av_always_inline void
1428 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1429                           const uint16_t *src1, const uint16_t *src2,
1430                           int width, enum PixelFormat origin)
1431 {
1432     int i;
1433     assert(src1==src2);
1434     for (i = 0; i < width; i++) {
1435         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1436         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1437         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1438
1439         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1440         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1441     }
1442 }
1443
/* r, b and input_pixel were only meant for the rgb48To* templates above. */
1444 #undef r
1445 #undef b
1446 #undef input_pixel
1447
/*
 * rgb48funcs(pattern, BE_LE, origin)
 * Defines <pattern>48<BE_LE>ToY_c(), ...ToUV_c() and ...ToUV_half_c(). Each
 * takes byte pointers, reinterprets them as uint16_t pointers and calls the
 * matching rgb48To*_c_template() with the given origin format. The trailing
 * `unused` parameter is not read.
 */
1448 #define rgb48funcs(pattern, BE_LE, origin) \
1449 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
1450                                     int width, uint32_t *unused) \
1451 { \
1452     const uint16_t *src = (const uint16_t *) _src; \
1453     uint16_t *dst = (uint16_t *) _dst; \
1454     rgb48ToY_c_template(dst, src, width, origin); \
1455 } \
1456  \
1457 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1458                                     const uint8_t *_src1, const uint8_t *_src2, \
1459                                     int width, uint32_t *unused) \
1460 { \
1461     const uint16_t *src1 = (const uint16_t *) _src1, \
1462                    *src2 = (const uint16_t *) _src2; \
1463     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1464     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1465 } \
1466  \
1467 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1468                                     const uint8_t *_src1, const uint8_t *_src2, \
1469                                     int width, uint32_t *unused) \
1470 { \
1471     const uint16_t *src1 = (const uint16_t *) _src1, \
1472                    *src2 = (const uint16_t *) _src2; \
1473     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1474     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1475 }
1476
/* One set of input converters per component order and byte order. */
1477 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1478 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1479 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1480 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1481
/* Read packed pixel number i from `src`: a 32-bit load via AV_RN32A for the
 * four 32-bit formats (presumably native-endian and aligned -- confirm
 * against the macro's definition), otherwise a 16-bit load in the byte order
 * given by isBE(origin). Expects `src` and `origin` to be in scope. */
1482 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1483                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1484                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1485
1486 static av_always_inline void
1487 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1488                        int width, enum PixelFormat origin,
1489                        int shr,   int shg,   int shb, int shp,
1490                        int maskr, int maskg, int maskb,
1491                        int rsh,   int gsh,   int bsh, int S)
1492 {
1493     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1494               rnd = 33 << (S - 1);
1495     int i;
1496
1497     for (i = 0; i < width; i++) {
1498         int px = input_pixel(i) >> shp;
1499         int b = (px & maskb) >> shb;
1500         int g = (px & maskg) >> shg;
1501         int r = (px & maskr) >> shr;
1502
1503         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1504     }
1505 }
1506
1507 static av_always_inline void
1508 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1509                         const uint8_t *src, int width,
1510                         enum PixelFormat origin,
1511                         int shr,   int shg,   int shb, int shp,
1512                         int maskr, int maskg, int maskb,
1513                         int rsh,   int gsh,   int bsh, int S)
1514 {
1515     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1516               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1517               rnd = 257 << (S - 1);
1518     int i;
1519
1520     for (i = 0; i < width; i++) {
1521         int px = input_pixel(i) >> shp;
1522         int b = (px & maskb) >> shb;
1523         int g = (px & maskg) >> shg;
1524         int r = (px & maskr) >> shr;
1525
1526         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1527         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1528     }
1529 }
1530
/*
 * Chroma extractor for 16- and 32-bit packed RGB input that consumes two
 * horizontally adjacent source pixels per output sample: `width` U/V
 * samples are produced from pixels 2*i and 2*i+1.
 *
 * The two pixels are added as whole words instead of per component, so
 * every component field of the sum is one bit wider than in a single
 * pixel. The extra factor of two is removed again by using S + 1 as the
 * final shift (with the rounding constant 257 << S scaled to match).
 */
static av_always_inline void
rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src, int width,
                             enum PixelFormat origin,
                             int shr,   int shg,   int shb, int shp,
                             int maskr, int maskg, int maskb,
                             int rsh,   int gsh,   int bsh, int S)
{
    /* maskgx selects every bit that is neither red nor blue: the green
     * field plus whatever else the pixel word contains. */
    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
              rnd = 257 << S, maskgx = ~(maskr | maskb);
    int i;

    /* Widen each component mask by one bit so it covers a two-pixel sum. */
    maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
    for (i = 0; i < width; i++) {
        int px0 = input_pixel(2 * i + 0) >> shp;
        int px1 = input_pixel(2 * i + 1) >> shp;
        /* g: sum of the non-red/blue parts; rb: sum of the red and blue
         * fields only (total minus g), so carries from one field cannot
         * spill into a neighbouring one. */
        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
        int rb = px0 + px1 - g;

        b = (rb & maskb) >> shb;
        if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
            origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
            /* g is used unmasked here */
            g >>= shg;
        } else {
            /* g may still hold bits outside the green field (everything
             * that was not red/blue was accumulated); strip them with the
             * widened green mask. */
            g = (g  & maskg) >> shg;
        }
        r = (rb & maskr) >> shr;

        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
    }
}
1564
1565 #undef input_pixel
1566
/*
 * rgb16_32_wrapper(fmt, name, ...) defines three static functions,
 * name##ToY_c, name##ToUV_c and name##ToUV_half_c, that forward to the
 * rgb16_32To*_c_template functions with the constant arguments given here:
 *   fmt               pixel format passed to the templates as `origin`
 *   shr, shg, shb     right shift applied to each masked component
 *   shp               right shift applied to the whole pixel first
 *   maskr/g/b         bit mask selecting each component
 *   rsh, gsh, bsh     left shift applied to the R/G/B coefficients
 *   S                 final right shift of the weighted sum
 * The `unused` and `dummy` parameters of the generated functions are not
 * referenced.
 */
#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
                         maskg, maskb, rsh, gsh, bsh, S) \
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
                          int width, uint32_t *unused) \
{ \
    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                           const uint8_t *src, const uint8_t *dummy, \
                           int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
                                const uint8_t *src, const uint8_t *dummy, \
                                int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
}

/* One instantiation per supported 32-bit and 15/16-bit packed RGB layout.
 * The 15-bit (555) variants use a final shift that is one smaller than the
 * 16-bit (565) and 32-bit variants. */
rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1604
/* Copy the first byte of every 4-byte pixel (the alpha byte of an
 * alpha-first layout) into dst. `unused` is ignored. */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x;
    for (x = 0; x < width; x++) {
        const uint8_t *pixel = src + 4 * x;
        dst[x] = pixel[0];
    }
}
1612
/* Copy the fourth byte of every 4-byte pixel (the alpha byte of an
 * alpha-last layout) into dst. `unused` is ignored. */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x;
    for (x = 0; x < width; x++) {
        const uint8_t *pixel = src + 4 * x;
        dst[x] = pixel[3];
    }
}
1620
/* Look up every source index in `pal` and store the lowest byte of the
 * palette entry as the output sample. */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;
    for (x = 0; x < width; x++) {
        const uint32_t entry = pal[src[x]];

        dst[x] = entry & 0xFF;
    }
}
1630
/* Look up every source index in `pal` and store bits 8..15 of the palette
 * entry in dstU and bits 16..23 in dstV. Both source pointers must be the
 * same; only src1 is read. */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x;
    assert(src1 == src2);
    for (x = 0; x < width; x++) {
        const uint32_t entry = pal[src1[x]];

        dstU[x] = (uint8_t) (entry >> 8);
        dstV[x] = (uint8_t) (entry >> 16);
    }
}
1644
/*
 * Expand 1-bit-per-pixel input, most significant bit first, to one byte
 * per pixel. The source bits are inverted first, so a 0 bit becomes 255
 * and a 1 bit becomes 0.
 *
 * Exactly `width` output samples are written. When width is not a
 * multiple of 8, the leading bits of the final (partial) source byte are
 * converted as well instead of being silently dropped.
 * `unused` is ignored.
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width % 8;
    int i, j;

    for (i = 0; i < full_bytes; i++) {
        int d = ~src[i];
        for (j = 0; j < 8; j++)
            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
    }
    /* trailing pixels that do not fill a whole source byte */
    if (tail_bits > 0) {
        int d = ~src[full_bytes];
        for (j = 0; j < tail_bits; j++)
            dst[8 * full_bytes + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1655
/*
 * Expand 1-bit-per-pixel input, most significant bit first, to one byte
 * per pixel: a 1 bit becomes 255 and a 0 bit becomes 0.
 *
 * Exactly `width` output samples are written. When width is not a
 * multiple of 8, the leading bits of the final (partial) source byte are
 * converted as well instead of being silently dropped.
 * `unused` is ignored.
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width % 8;
    int i, j;

    for (i = 0; i < full_bytes; i++) {
        int d = src[i];
        for (j = 0; j < 8; j++)
            dst[8 * i + j] = ((d >> (7 - j)) & 1) * 255;
    }
    /* trailing pixels that do not fill a whole source byte */
    if (tail_bits > 0) {
        int d = src[full_bytes];
        for (j = 0; j < tail_bits; j++)
            dst[8 * full_bytes + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1666
//FIXME yuy2* can read up to 7 samples too many
1668
/* Pick every even-indexed byte of the source line (the first byte of each
 * 2-byte pair). `unused` is ignored. */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;
    for (x = 0; x < width; x++) {
        const uint8_t *pair = src + 2 * x;
        dst[x] = pair[0];
    }
}
1676
/* From every 4-byte group of src1 take byte 1 as U and byte 3 as V.
 * src2 is only checked (after the copy) to be identical to src1;
 * `unused` is ignored. */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;
    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4 * x;
        dstU[x] = group[1];
        dstV[x] = group[3];
    }
    assert(src1 == src2);
}
1687
/* Byte-swap `width` 16-bit samples from _src into _dst using av_bswap16().
 * Both buffers are accessed as uint16_t arrays; `unused` is ignored. */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
{
    const uint16_t *in  = (const uint16_t *) _src;
    uint16_t       *out = (uint16_t *) _dst;
    int x = 0;

    while (x < width) {
        out[x] = av_bswap16(in[x]);
        x++;
    }
}
1697
/* Byte-swap `width` 16-bit samples from each of the two source planes
 * (_src1 -> _dstU, _src2 -> _dstV) using av_bswap16(). All buffers are
 * accessed as uint16_t arrays; `unused` is ignored. */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *inU  = (const uint16_t *) _src1;
    const uint16_t *inV  = (const uint16_t *) _src2;
    uint16_t       *outU = (uint16_t *) _dstU;
    uint16_t       *outV = (uint16_t *) _dstV;
    int x = 0;

    while (x < width) {
        outU[x] = av_bswap16(inU[x]);
        outV[x] = av_bswap16(inV[x]);
        x++;
    }
}
1710
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
/* Pick every odd-indexed byte of the source line (the second byte of each
 * 2-byte pair). `unused` is ignored. */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;
    for (x = 0; x < width; x++) {
        const uint8_t *pair = src + 2 * x;
        dst[x] = pair[1];
    }
}
1720
/* From every 4-byte group of src1 take byte 0 as U and byte 2 as V.
 * src2 is only checked (after the copy) to be identical to src1;
 * `unused` is ignored. */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;
    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4 * x;
        dstU[x] = group[0];
        dstV[x] = group[2];
    }
    assert(src1 == src2);
}
1731
1732 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1733                                         const uint8_t *src, int width)
1734 {
1735     int i;
1736     for (i = 0; i < width; i++) {
1737         dst1[i] = src[2*i+0];
1738         dst2[i] = src[2*i+1];
1739     }
1740 }
1741
/* Split the interleaved chroma line src1 with nvXXtoUV_c(): the first byte
 * of each pair is stored in dstU, the second in dstV. src2 and `unused`
 * are not referenced. */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1748
/* Same as nv12ToUV_c() but with the destinations swapped: the first byte
 * of each pair is stored in dstV, the second in dstU. src2 and `unused`
 * are not referenced. */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1755
1756 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1757
1758 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1759                        int width, uint32_t *unused)
1760 {
1761     int i;
1762     for (i=0; i<width; i++) {
1763         int b= src[i*3+0];
1764         int g= src[i*3+1];
1765         int r= src[i*3+2];
1766
1767         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1768     }
1769 }
1770
1771 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1772                         const uint8_t *src2, int width, uint32_t *unused)
1773 {
1774     int i;
1775     for (i=0; i<width; i++) {
1776         int b= src1[3*i + 0];
1777         int g= src1[3*i + 1];
1778         int r= src1[3*i + 2];
1779
1780         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1781         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1782     }
1783     assert(src1 == src2);
1784 }
1785
1786 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1787                              const uint8_t *src2, int width, uint32_t *unused)
1788 {
1789     int i;
1790     for (i=0; i<width; i++) {
1791         int b= src1[6*i + 0] + src1[6*i + 3];
1792         int g= src1[6*i + 1] + src1[6*i + 4];
1793         int r= src1[6*i + 2] + src1[6*i + 5];
1794
1795         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1796         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1797     }
1798     assert(src1 == src2);
1799 }
1800
1801 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1802                        uint32_t *unused)
1803 {
1804     int i;
1805     for (i=0; i<width; i++) {
1806         int r= src[i*3+0];
1807         int g= src[i*3+1];
1808         int b= src[i*3+2];
1809
1810         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1811     }
1812 }
1813
1814 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1815                         const uint8_t *src2, int width, uint32_t *unused)
1816 {
1817     int i;
1818     assert(src1==src2);
1819     for (i=0; i<width; i++) {
1820         int r= src1[3*i + 0];
1821         int g= src1[3*i + 1];
1822         int b= src1[3*i + 2];
1823
1824         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1825         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1826     }
1827 }
1828
1829 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1830                              const uint8_t *src2, int width, uint32_t *unused)
1831 {
1832     int i;
1833     assert(src1==src2);
1834     for (i=0; i<width; i++) {
1835         int r= src1[6*i + 0] + src1[6*i + 3];
1836         int g= src1[6*i + 1] + src1[6*i + 4];
1837         int b= src1[6*i + 2] + src1[6*i + 5];
1838
1839         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1840         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1841     }
1842 }
1843
1844 static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
1845                        const int16_t *filter,
1846                        const int16_t *filterPos, int filterSize)
1847 {
1848     int i;
1849     int32_t *dst = (int32_t *) _dst;
1850     const uint16_t *src = (const uint16_t *) _src;
1851     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1852     int sh = (bits <= 7) ? 11 : (bits - 4);
1853
1854     for (i = 0; i < dstW; i++) {
1855         int j;
1856         int srcPos = filterPos[i];
1857         unsigned int val = 0;
1858
1859         for (j = 0; j < filterSize; j++) {
1860             val += src[srcPos + j] * filter[filterSize * i + j];
1861         }
1862         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1863         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1864     }
1865 }
1866
1867 // bilinear / bicubic scaling
1868 static void hScale_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
1869                      const int16_t *filter, const int16_t *filterPos,
1870                      int filterSize)
1871 {
1872     int i;
1873     for (i=0; i<dstW; i++) {
1874         int j;
1875         int srcPos= filterPos[i];
1876         int val=0;
1877         for (j=0; j<filterSize; j++) {
1878             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1879         }
1880         //filter += hFilterSize;
1881         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
1882         //dst[i] = val>>7;
1883     }
1884 }
1885
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/* In-place linear remap of both chroma lines: each sample is limited to
 * at most 30775, multiplied by 4663, reduced by 9289992 and shifted right
 * by 12. dstU[i] is stored before dstV[i] is read. */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x;
    for (x = 0; x < width; x++) {
        int clipped = FFMIN(dstU[x], 30775);
        dstU[x] = (clipped * 4663 - 9289992) >> 12; //-264

        clipped = FFMIN(dstV[x], 30775);
        dstV[x] = (clipped * 4663 - 9289992) >> 12; //-264
    }
}
/* In-place linear remap of both chroma lines:
 * sample = (sample * 1799 + 4081085) >> 11.
 * dstU[i] is stored before dstV[i] is read. */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;
    while (x < width) {
        const int u = dstU[x];
        dstU[x] = (u * 1799 + 4081085) >> 11; //1469
        {
            const int v = dstV[x];
            dstV[x] = (v * 1799 + 4081085) >> 11; //1469
        }
        x++;
    }
}
/* In-place linear remap of a luma line: each sample is limited to at most
 * 30189, multiplied by 19077, reduced by 39057361 and shifted right
 * by 14. */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x;
    for (x = 0; x < width; x++) {
        const int clipped = FFMIN(dst[x], 30189);
        dst[x] = (clipped * 19077 - 39057361) >> 14;
    }
}
/* In-place linear remap of a luma line:
 * sample = (sample * 14071 + 33561947) >> 14. */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x = 0;
    while (x < width) {
        const int y = dst[x];
        dst[x] = (y * 14071 + 33561947) >> 14;
        x++;
    }
}
1916
/*
 * In-place linear remap of both chroma lines for the high-bit-depth path;
 * the buffers are accessed as int32_t arrays. Each sample is limited to at
 * most 30775 << 4, multiplied by 4663, reduced by 9289992 << 4 and shifted
 * right by 12.
 *
 * The product is formed in 64 bits: (30775 << 4) * 4663 is larger than
 * INT_MAX, so the plain int multiplication overflowed (undefined
 * behaviour) for samples near the clamp value. The final result always
 * fits in int32_t again.
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        dstU[i] = (FFMIN(dstU[i], 30775 << 4) * (int64_t) 4663 - (9289992 << 4)) >> 12; //-264
        dstV[i] = (FFMIN(dstV[i], 30775 << 4) * (int64_t) 4663 - (9289992 << 4)) >> 12; //-264
    }
}
/* In-place linear remap of both chroma lines for the high-bit-depth path;
 * the buffers are accessed as int32_t arrays:
 * sample = (sample * 1799 + (4081085 << 4)) >> 11.
 * dstU[i] is stored before dstV[i] is read. */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u = (int32_t *) _dstU;
    int32_t *v = (int32_t *) _dstV;
    int x = 0;

    while (x < width) {
        u[x] = (u[x] * 1799 + (4081085 << 4)) >> 11; //1469
        v[x] = (v[x] * 1799 + (4081085 << 4)) >> 11; //1469
        x++;
    }
}
/*
 * In-place linear remap of a luma line for the high-bit-depth path; the
 * buffer is accessed as an int32_t array. Each sample is limited to at
 * most 30189 << 4, multiplied by 4769, reduced by 39057361 << 2 and
 * shifted right by 12.
 *
 * The product is formed in 64 bits: (30189 << 4) * 4769 is larger than
 * INT_MAX, so the plain int multiplication overflowed (undefined
 * behaviour) for samples near the clamp value. The final result always
 * fits in int32_t again.
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++)
        dst[i] = (FFMIN(dst[i], 30189 << 4) * (int64_t) 4769 - (39057361 << 2)) >> 12;
}
/*
 * In-place linear remap of a luma line for the high-bit-depth path; the
 * buffer is accessed as an int32_t array:
 * sample = (sample * 14071 + (33561947 << 4)) >> 14.
 *
 * The multiplication is done in 64 bits. With 32-bit int arithmetic the
 * product overflows for samples above roughly 114000, which corrupted
 * the result for the upper part of a 19-bit input range. Results for
 * inputs that did not overflow before are unchanged.
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++)
        dst[i] = ((int64_t) dst[i] * 14071 + (33561947 << 4)) >> 14;
}
1951
1952 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1953                            const uint8_t *src, int srcW, int xInc)
1954 {
1955     int i;
1956     unsigned int xpos=0;
1957     for (i=0;i<dstWidth;i++) {
1958         register unsigned int xx=xpos>>16;
1959         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1960         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1961         xpos+=xInc;
1962     }
1963 }
1964
/* Widen `len` 8-bit samples to 16 bits by writing each source byte into
 * both bytes of the corresponding output sample. The line is processed
 * from the last sample to the first, so it also works when src is the
 * start of the destination buffer. */
static void scale8To16Rv_c(uint16_t *_dst, const uint8_t *src, int len)
{
    uint8_t *out = (uint8_t *) _dst;
    int i = len;

    while (i-- > 0) {
        const uint8_t sample = src[i];

        out[2 * i + 1] = sample;
        out[2 * i]     = sample;
    }
}
1973
/* Narrow `len` 32-bit samples to 16 bits by dropping the four least
 * significant bits (src[i] >> 4). The line is processed front to back. */
static void scale19To15Fw_c(int16_t *dst, const int32_t *src, int len)
{
    int i = 0;

    while (i < len) {
        const int32_t wide = src[i];

        dst[i] = wide >> 4;
        i++;
    }
}
1981
1982 // *** horizontal scale Y line to temp buffer
1983 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
1984                                      const uint8_t *src, int srcW, int xInc,
1985                                      const int16_t *hLumFilter,
1986                                      const int16_t *hLumFilterPos, int hLumFilterSize,
1987                                      uint8_t *formatConvBuffer,
1988                                      uint32_t *pal, int isAlpha)
1989 {
1990     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1991     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1992
1993     if (toYV12) {
1994         toYV12(formatConvBuffer, src, srcW, pal);
1995         src= formatConvBuffer;
1996     }
1997
1998     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
1999         c->scale8To16Rv((uint16_t *) formatConvBuffer, src, srcW);
2000         src = formatConvBuffer;
2001     }
2002
2003     if (!c->hyscale_fast) {
2004         c->hScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2005     } else { // fast bilinear upscale / crap downscale
2006         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2007     }
2008
2009     if (convertRange)
2010         convertRange(dst, dstWidth);
2011
2012     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2013         c->scale19To15Fw(dst, (int32_t *) dst, dstWidth);
2014     }
2015 }
2016
2017 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2018                            int dstWidth, const uint8_t *src1,
2019                            const uint8_t *src2, int srcW, int xInc)
2020 {
2021     int i;
2022     unsigned int xpos=0;
2023     for (i=0;i<dstWidth;i++) {
2024         register unsigned int xx=xpos>>16;
2025         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2026         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2027         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2028         xpos+=xInc;
2029     }
2030 }
2031
2032 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2033                                      const uint8_t *src1, const uint8_t *src2,
2034                                      int srcW, int xInc, const int16_t *hChrFilter,
2035                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2036                                      uint8_t *formatConvBuffer, uint32_t *pal)
2037 {
2038     if (c->chrToYV12) {
2039         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * c->scalingBpp >> 3, 16);
2040         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2041         src1= formatConvBuffer;
2042         src2= buf2;
2043     }
2044
2045     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2046         uint8_t *buf2 = (formatConvBuffer + FFALIGN(srcW * 2, 16));
2047         c->scale8To16Rv((uint16_t *) formatConvBuffer, src1, srcW);
2048         c->scale8To16Rv((uint16_t *) buf2,             src2, srcW);
2049         src1 = formatConvBuffer;
2050         src2 = buf2;
2051     }
2052
2053     if (!c->hcscale_fast) {
2054         c->hScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2055         c->hScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2056     } else { // fast bilinear upscale / crap downscale
2057         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2058     }
2059
2060     if (c->chrConvertRange)
2061         c->chrConvertRange(dst1, dst2, dstWidth);
2062
2063     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16) {
2064         c->scale19To15Fw(dst1, (int32_t *) dst1, dstWidth);
2065         c->scale19To15Fw(dst2, (int32_t *) dst2, dstWidth);
2066     }
2067 }
2068
/*
 * Select the C output writers that match c->dstFormat and c->flags.
 *
 * Planar writers (*yuv2yuv1, *yuv2yuvX) are chosen first from the
 * destination format; *yuv2yuv1 is only assigned in the final fallback
 * branch. Packed writers are chosen afterwards: with SWS_FULL_CHR_H_INT
 * set only *yuv2packedX is assigned, otherwise *yuv2packed1, *yuv2packed2
 * and *yuv2packedX are assigned together.
 *
 * An output pointer whose case does not match is left untouched, so the
 * caller's previous value is kept (neither switch has a default label).
 */
static av_always_inline void
find_c_packed_planar_out_funcs(SwsContext *c,
                               yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
                               yuv2packedX_fn *yuv2packedX)
{
    enum PixelFormat dstFormat = c->dstFormat;

    /* planar / semi-planar writers */
    if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
        *yuv2yuvX     = yuv2nv12X_c;
    } else if (is16BPS(dstFormat)) {
        *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
    } else if (is9_OR_10BPS(dstFormat)) {
        /* depth_minus1 == 8 means 9 bits per component, otherwise 10 */
        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
        } else {
            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
        }
    } else {
        *yuv2yuv1     = yuv2yuv1_c;
        *yuv2yuvX     = yuv2yuvX_c;
    }
    /* packed writers, SWS_FULL_CHR_H_INT variant: only the X function is
     * set. Without CONFIG_SMALL the alpha-writing variant is used only if
     * CONFIG_SWSCALE_ALPHA is enabled and c->alpPixBuf is set. */
    if(c->flags & SWS_FULL_CHR_H_INT) {
        switch (dstFormat) {
            case PIX_FMT_RGBA:
#if CONFIG_SMALL
                *yuv2packedX = yuv2rgba32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2rgba32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2rgbx32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_ARGB:
#if CONFIG_SMALL
                *yuv2packedX = yuv2argb32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2argb32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2xrgb32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_BGRA:
#if CONFIG_SMALL
                *yuv2packedX = yuv2bgra32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2bgra32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2bgrx32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_ABGR:
#if CONFIG_SMALL
                *yuv2packedX = yuv2abgr32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2abgr32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2xbgr32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_RGB24:
            *yuv2packedX = yuv2rgb24_full_X_c;
            break;
        case PIX_FMT_BGR24:
            *yuv2packedX = yuv2bgr24_full_X_c;
            break;
        }
    } else {
        /* packed writers, regular variant: the 1-, 2- and X-line functions
         * are always set as a group. */
        switch (dstFormat) {
        case PIX_FMT_GRAY16BE:
            *yuv2packed1 = yuv2gray16BE_1_c;
            *yuv2packed2 = yuv2gray16BE_2_c;
            *yuv2packedX = yuv2gray16BE_X_c;
            break;
        case PIX_FMT_GRAY16LE:
            *yuv2packed1 = yuv2gray16LE_1_c;
            *yuv2packed2 = yuv2gray16LE_2_c;
            *yuv2packedX = yuv2gray16LE_X_c;
            break;
        case PIX_FMT_MONOWHITE:
            *yuv2packed1 = yuv2monowhite_1_c;
            *yuv2packed2 = yuv2monowhite_2_c;
            *yuv2packedX = yuv2monowhite_X_c;
            break;
        case PIX_FMT_MONOBLACK:
            *yuv2packed1 = yuv2monoblack_1_c;
            *yuv2packed2 = yuv2monoblack_2_c;
            *yuv2packedX = yuv2monoblack_X_c;
            break;
        case PIX_FMT_YUYV422:
            *yuv2packed1 = yuv2yuyv422_1_c;
            *yuv2packed2 = yuv2yuyv422_2_c;
            *yuv2packedX = yuv2yuyv422_X_c;
            break;
        case PIX_FMT_UYVY422:
            *yuv2packed1 = yuv2uyvy422_1_c;
            *yuv2packed2 = yuv2uyvy422_2_c;
            *yuv2packedX = yuv2uyvy422_X_c;
            break;
        case PIX_FMT_RGB48LE:
            *yuv2packed1 = yuv2rgb48le_1_c;
            *yuv2packed2 = yuv2rgb48le_2_c;
            *yuv2packedX = yuv2rgb48le_X_c;
            break;
        case PIX_FMT_RGB48BE:
            *yuv2packed1 = yuv2rgb48be_1_c;
            *yuv2packed2 = yuv2rgb48be_2_c;
            *yuv2packedX = yuv2rgb48be_X_c;
            break;
        case PIX_FMT_BGR48LE:
            *yuv2packed1 = yuv2bgr48le_1_c;
            *yuv2packed2 = yuv2bgr48le_2_c;
            *yuv2packedX = yuv2bgr48le_X_c;
            break;
        case PIX_FMT_BGR48BE:
            *yuv2packed1 = yuv2bgr48be_1_c;
            *yuv2packed2 = yuv2bgr48be_2_c;
            *yuv2packedX = yuv2bgr48be_X_c;
            break;
        /* RGB32 and BGR32 share one set of writers */
        case PIX_FMT_RGB32:
        case PIX_FMT_BGR32:
#if CONFIG_SMALL
            *yuv2packed1 = yuv2rgb32_1_c;
            *yuv2packed2 = yuv2rgb32_2_c;
            *yuv2packedX = yuv2rgb32_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_c;
                    *yuv2packed2 = yuv2rgba32_2_c;
                    *yuv2packedX = yuv2rgba32_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_c;
                    *yuv2packed2 = yuv2rgbx32_2_c;
                    *yuv2packedX = yuv2rgbx32_X_c;
                }
#endif /* !CONFIG_SMALL */
            break;
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32_1:
#if CONFIG_SMALL
                *yuv2packed1 = yuv2rgb32_1_1_c;
                *yuv2packed2 = yuv2rgb32_1_2_c;
                *yuv2packedX = yuv2rgb32_1_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_1_c;
                    *yuv2packed2 = yuv2rgba32_1_2_c;
                    *yuv2packedX = yuv2rgba32_1_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_1_c;
                    *yuv2packed2 = yuv2rgbx32_1_2_c;
                    *yuv2packedX = yuv2rgbx32_1_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
        case PIX_FMT_RGB24:
            *yuv2packed1 = yuv2rgb24_1_c;
            *yuv2packed2 = yuv2rgb24_2_c;
            *yuv2packedX = yuv2rgb24_X_c;
            break;
        case PIX_FMT_BGR24:
            *yuv2packed1 = yuv2bgr24_1_c;
            *yuv2packed2 = yuv2bgr24_2_c;
            *yuv2packedX = yuv2bgr24_X_c;
            break;
        /* for the low-depth RGB formats, RGB/BGR order and endianness all
         * map to the same writer per bit depth */
        case PIX_FMT_RGB565LE:
        case PIX_FMT_RGB565BE:
        case PIX_FMT_BGR565LE:
        case PIX_FMT_BGR565BE:
            *yuv2packed1 = yuv2rgb16_1_c;
            *yuv2packed2 = yuv2rgb16_2_c;
            *yuv2packedX = yuv2rgb16_X_c;
            break;
        case PIX_FMT_RGB555LE:
        case PIX_FMT_RGB555BE:
        case PIX_FMT_BGR555LE:
        case PIX_FMT_BGR555BE:
            *yuv2packed1 = yuv2rgb15_1_c;
            *yuv2packed2 = yuv2rgb15_2_c;
            *yuv2packedX = yuv2rgb15_X_c;
            break;
        case PIX_FMT_RGB444LE:
        case PIX_FMT_RGB444BE:
        case PIX_FMT_BGR444LE:
        case PIX_FMT_BGR444BE:
            *yuv2packed1 = yuv2rgb12_1_c;
            *yuv2packed2 = yuv2rgb12_2_c;
            *yuv2packedX = yuv2rgb12_X_c;
            break;
        case PIX_FMT_RGB8:
        case PIX_FMT_BGR8:
            *yuv2packed1 = yuv2rgb8_1_c;
            *yuv2packed2 = yuv2rgb8_2_c;
            *yuv2packedX = yuv2rgb8_X_c;
            break;
        case PIX_FMT_RGB4:
        case PIX_FMT_BGR4:
            *yuv2packed1 = yuv2rgb4_1_c;
            *yuv2packed2 = yuv2rgb4_2_c;
            *yuv2packedX = yuv2rgb4_X_c;
            break;
        case PIX_FMT_RGB4_BYTE:
        case PIX_FMT_BGR4_BYTE:
            *yuv2packed1 = yuv2rgb4b_1_c;
            *yuv2packed2 = yuv2rgb4b_2_c;
            *yuv2packedX = yuv2rgb4b_X_c;
            break;
        }
    }
}
2305
/* Set DEBUG_SWSCALE_BUFFERS to a non-zero value to make DEBUG_BUFFERS()
 * forward its arguments to av_log() at AV_LOG_DEBUG level; with 0 the
 * guarding condition is constant-false. The macro expands to a bare `if`
 * statement and expects a variable named `c` to be in scope. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2308
/**
 * Scale one horizontal slice of the source picture into the destination.
 *
 * For every destination line that can be produced from the source lines seen
 * so far, the needed source lines are first passed through hyscale()/hcscale()
 * into the luma/chroma/alpha line buffers (lumPixBuf, chrUPixBuf, chrVPixBuf,
 * alpPixBuf), and then one output line is written through one of the
 * yuv2yuv1 / yuv2yuvX / yuv2packed1 / yuv2packed2 / yuv2packedX function
 * pointers. Progress (dstY, buffer indices, last buffered source lines) is
 * kept in the context so that the next slice continues where this one stopped.
 *
 * @param c          scaler context; read for configuration, updated with progress
 * @param src        source plane pointers; NOTE: written to for packed input
 * @param srcStride  source strides; NOTE: modified in place (packed aliasing
 *                   and the vChrDrop shift below)
 * @param srcSliceY  first source line contained in this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines written by this call
 */
2309 static int swScale(SwsContext *c, const uint8_t* src[],
2310                    int srcStride[], int srcSliceY,
2311                    int srcSliceH, uint8_t* dst[], int dstStride[])
2312 {
2313     /* load a few things into local vars to make the code more readable? and faster */
2314     const int srcW= c->srcW;
2315     const int dstW= c->dstW;
2316     const int dstH= c->dstH;
2317     const int chrDstW= c->chrDstW;
2318     const int chrSrcW= c->chrSrcW;
2319     const int lumXInc= c->lumXInc;
2320     const int chrXInc= c->chrXInc;
2321     const enum PixelFormat dstFormat= c->dstFormat;
2322     const int flags= c->flags;
2323     int16_t *vLumFilterPos= c->vLumFilterPos;
2324     int16_t *vChrFilterPos= c->vChrFilterPos;
2325     int16_t *hLumFilterPos= c->hLumFilterPos;
2326     int16_t *hChrFilterPos= c->hChrFilterPos;
2327     int16_t *vLumFilter= c->vLumFilter;
2328     int16_t *vChrFilter= c->vChrFilter;
2329     int16_t *hLumFilter= c->hLumFilter;
2330     int16_t *hChrFilter= c->hChrFilter;
2331     int32_t *lumMmxFilter= c->lumMmxFilter;
2332     int32_t *chrMmxFilter= c->chrMmxFilter;
2333     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2334     const int vLumFilterSize= c->vLumFilterSize;
2335     const int vChrFilterSize= c->vChrFilterSize;
2336     const int hLumFilterSize= c->hLumFilterSize;
2337     const int hChrFilterSize= c->hChrFilterSize;
2338     int16_t **lumPixBuf= c->lumPixBuf;
2339     int16_t **chrUPixBuf= c->chrUPixBuf;
2340     int16_t **chrVPixBuf= c->chrVPixBuf;
2341     int16_t **alpPixBuf= c->alpPixBuf;
2342     const int vLumBufSize= c->vLumBufSize;
2343     const int vChrBufSize= c->vChrBufSize;
2344     uint8_t *formatConvBuffer= c->formatConvBuffer;
     /* Slice position/height expressed in chroma lines. The negate-shift-negate
        form rounds the height up instead of down (assuming an arithmetic
        right shift of negative values). */
2345     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2346     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2347     int lastDstY;
2348     uint32_t *pal=c->pal_yuv;
     /* Local copies of the output function pointers: they may be replaced
        further down (last output lines) without touching the context. */
2349     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2350     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2351     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2352     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2353     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2354
2355     /* vars which will change and which we need to store back in the context */
2356     int dstY= c->dstY;
2357     int lumBufIndex= c->lumBufIndex;
2358     int chrBufIndex= c->chrBufIndex;
2359     int lastInLumBuf= c->lastInLumBuf;
2360     int lastInChrBuf= c->lastInChrBuf;
2361
     /* Packed input: make all four "planes" alias plane 0 so the per-plane
        code below can be used unchanged. This writes to the caller's arrays. */
2362     if (isPacked(c->srcFormat)) {
2363         src[0]=
2364         src[1]=
2365         src[2]=
2366         src[3]= src[0];
2367         srcStride[0]=
2368         srcStride[1]=
2369         srcStride[2]=
2370         srcStride[3]= srcStride[0];
2371     }
     /* Multiply the chroma strides by 2^vChrDrop (also in the caller's array);
        presumably this makes the reader skip chroma lines -- TODO confirm
        against where vChrDrop is set up. */
2372     srcStride[1]<<= c->vChrDrop;
2373     srcStride[2]<<= c->vChrDrop;
2374
2375     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2376                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2377                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2378     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2379                    srcSliceY,    srcSliceH,    dstY,    dstH);
2380     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2381                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2382
     /* Warn (only with SWS_PRINT_INFO) when a destination stride is not a
        multiple of 8. The flag is a function-level static, so the warning is
        emitted at most once for all contexts and is not synchronised. */
2383     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2384         static int warnedAlready=0; //FIXME move this into the context perhaps
2385         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2386             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2387                    "         ->cannot do aligned memory accesses anymore\n");
2388             warnedAlready=1;
2389         }
2390     }
2391
2392     /* Note the user might start scaling the picture in the middle so this
2393        will not get executed. This is not really intended but works
2394        currently, so people might do it. */
     /* A slice starting at source line 0 begins a new picture: reset progress. */
2395     if (srcSliceY ==0) {
2396         lumBufIndex=-1;
2397         chrBufIndex=-1;
2398         dstY=0;
2399         lastInLumBuf= -1;
2400         lastInChrBuf= -1;
2401     }
2402
     /* Remember where this call started so the number of lines produced can
        be computed at the end. */
2403     lastDstY= dstY;
2404
     /* One iteration per destination line; leaves early (break below) when
        the slice does not contain all source lines the line needs. */
2405     for (;dstY < dstH; dstY++) {
2406         const int chrDstY= dstY>>c->chrDstVSubSample;
         /* Output pointers for this line; the alpha pointer is only set when
            alpha support is compiled in and an alpha buffer exists. */
2407         uint8_t *dest[4] = {
2408             dst[0] + dstStride[0] * dstY,
2409             dst[1] + dstStride[1] * chrDstY,
2410             dst[2] + dstStride[2] * chrDstY,
2411             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2412         };
2413
2414         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
         /* Same lookup for the last luma row that maps to this chroma row
            (dstY with the subsampling bits set), clamped to the last row. */
2415         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2416         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2417         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2418         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2419         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2420         int enough_lines;
2421
2422         //handle holes (FAST_BILINEAR & weird filters)
         /* Source lines in front of the first needed one are never read:
            treat them as already consumed so they are not scaled below. */
2423         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2424         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2425         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2426         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2427
2428         DEBUG_BUFFERS("dstY: %d\n", dstY);
2429         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2430                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2431         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2432                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2433
2434         // Do we have enough lines in this slice to output the dstY line
         /* The luma test uses lastLumSrcY2, i.e. the requirement of the last
            luma row sharing this chroma row; the chroma bound is the slice
            end in chroma lines, rounded up. */
2435         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2436
         /* Not enough input yet: still buffer every remaining line of this
            slice so it is available when the next slice arrives. */
2437         if (!enough_lines) {
2438             lastLumSrcY = srcSliceY + srcSliceH - 1;
2439             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2440             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2441                                             lastLumSrcY, lastChrSrcY);
2442         }
2443
2444         //Do horizontal scaling
         /* Luma (and alpha, read from plane 3): one source line per iteration
            goes into the next line-buffer slot. */
2445         while(lastInLumBuf < lastLumSrcY) {
2446             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2447             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2448             lumBufIndex++;
             /* The index may temporarily run into the upper half; the pointer
                array is presumably allocated with 2*vLumBufSize entries
                (allocation not visible here). It is wrapped after the loops. */
2449             assert(lumBufIndex < 2*vLumBufSize);
2450             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2451             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2452             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2453                     hLumFilter, hLumFilterPos, hLumFilterSize,
2454                     formatConvBuffer,
2455                     pal, 0);
2456             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2457                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2458                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2459                         formatConvBuffer,
2460                         pal, 1);
2461             lastInLumBuf++;
2462             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2463                                lumBufIndex,    lastInLumBuf);
2464         }
         /* Chroma: same scheme, U and V are produced together by hcscale(). */
2465         while(lastInChrBuf < lastChrSrcY) {
2466             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2467             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2468             chrBufIndex++;
2469             assert(chrBufIndex < 2*vChrBufSize);
2470             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2471             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2472             //FIXME replace parameters through context struct (some at least)
2473
             /* The bookkeeping advances even when chroma scaling is skipped. */
2474             if (c->needs_hcscale)
2475                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2476                           chrDstW, src1, src2, chrSrcW, chrXInc,
2477                           hChrFilter, hChrFilterPos, hChrFilterSize,
2478                           formatConvBuffer, pal);
2479             lastInChrBuf++;
2480             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2481                                chrBufIndex,    lastInChrBuf);
2482         }
2483         //wrap buf index around to stay inside the ring buffer
2484         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2485         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2486         if (!enough_lines)
2487             break; //we can't output a dstY line so let's try with the next slice
2488
2489 #if HAVE_MMX
2490         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2491 #endif
         /* For the last two destination lines switch the local function
            pointers to the C implementations (the context keeps its own). */
2492         if (dstY >= dstH-2) {
2493             // hmm looks like we can't use MMX here without overwriting this array's tail
2494             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2495                                            &yuv2packed1, &yuv2packed2,
2496                                            &yuv2packedX);
2497         }
2498
2499         {
             /* Pointers to the buffered line holding source line first*SrcY:
                slot *BufIndex holds line lastIn*Buf, so the wanted line is
                (first*SrcY - lastIn*Buf) slots away; adding the buffer size
                keeps the offset non-negative (relies on the doubled pointer
                array assumed above). */
2500             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2501             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2502             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2503             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2504             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2505                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                 /* Chroma is written only on the first luma row of each
                    chroma row, and never for gray output. */
2506                 if ((dstY&chrSkipMask) || isGray(dstFormat))
2507                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
                 /* NOTE(review): the test reads c->yuv2yuv1 while the call
                    uses the local yuv2yuv1, which may have been replaced above. */
2508                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2509                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2510                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
2511                              dest, dstW, chrDstW);
2512                 } else { //General YV12
2513                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
2514                              lumSrcPtr, vLumFilterSize,
2515                              vChrFilter + chrDstY * vChrFilterSize,
2516                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2517                              alpSrcPtr, dest, dstW, chrDstW);
2518                 }
2519             } else {
                 /* Packed output: pick the cheapest writer that matches the
                    vertical filter sizes (1 luma/2 chroma taps, 2/2 taps,
                    otherwise the general N-tap writer). */
2520                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2521                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2522                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2523                     int chrAlpha = vChrFilter[2 * dstY + 1];
2524                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2525                                 alpPixBuf ? *alpSrcPtr : NULL,
2526                                 dest[0], dstW, chrAlpha, dstY);
2527                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2528                     int lumAlpha = vLumFilter[2 * dstY + 1];
2529                     int chrAlpha = vChrFilter[2 * dstY + 1];
                     /* Store the first filter coefficient, duplicated into
                        both 16-bit halves, in slots 2 and 3 of the MMX filter
                        tables; presumably read by the MMX writers -- not
                        visible here. */
2530                     lumMmxFilter[2] =
2531                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2532                     chrMmxFilter[2] =
2533                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2534                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2535                                 alpPixBuf ? alpSrcPtr : NULL,
2536                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2537                 } else { //general RGB
2538                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2539                                 lumSrcPtr, vLumFilterSize,
2540                                 vChrFilter + dstY * vChrFilterSize,
2541                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2542                                 alpSrcPtr, dest[0], dstW, dstY);
2543                 }
2544             }
2545         }
2546     }
2547
     /* YUVA420P output without an alpha buffer: fill the alpha plane with 255
        for the lines produced by this call (argument meaning of fillPlane()
        presumed from the call: plane, stride, width, height, first row, value). */
2548     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2549         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2550
     /* Issue a store fence when the CPU reports MMX2, then run emms_c(). */
2551 #if HAVE_MMX2
2552     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2553         __asm__ volatile("sfence":::"memory");
2554 #endif
2555     emms_c();
2556
2557     /* store changed local vars back in the context */
2558     c->dstY= dstY;
2559     c->lumBufIndex= lumBufIndex;
2560     c->chrBufIndex= chrBufIndex;
2561     c->lastInLumBuf= lastInLumBuf;
2562     c->lastInChrBuf= lastInChrBuf;
2563
     /* Number of destination lines written during this call. */
2564     return dstY - lastDstY;
2565 }
2566
2567 static av_cold void sws_init_swScale_c(SwsContext *c)
2568 {
2569     enum PixelFormat srcFormat = c->srcFormat;
2570
2571     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2572                                    &c->yuv2packed1, &c->yuv2packed2,
2573                                    &c->yuv2packedX);
2574
2575     c->chrToYV12 = NULL;
2576     switch(srcFormat) {
2577         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2578         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2579         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2580         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2581         case PIX_FMT_RGB8     :
2582         case PIX_FMT_BGR8     :
2583         case PIX_FMT_PAL8     :
2584         case PIX_FMT_BGR4_BYTE:
2585         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2586 #if HAVE_BIGENDIAN
2587         case PIX_FMT_YUV444P9LE:
2588         case PIX_FMT_YUV420P9LE:
2589         case PIX_FMT_YUV422P10LE:
2590         case PIX_FMT_YUV444P10LE:
2591         case PIX_FMT_YUV420P10LE:
2592         case PIX_FMT_YUV420P16LE:
2593         case PIX_FMT_YUV422P16LE:
2594         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2595 #else
2596         case PIX_FMT_YUV444P9BE:
2597         case PIX_FMT_YUV420P9BE:
2598         case PIX_FMT_YUV444P10BE:
2599         case PIX_FMT_YUV422P10BE:
2600         case PIX_FMT_YUV420P10BE:
2601         case PIX_FMT_YUV420P16BE:
2602         case PIX_FMT_YUV422P16BE:
2603         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2604 #endif
2605     }
2606     if (c->chrSrcHSubSample) {
2607         switch(srcFormat) {
2608         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2609         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2610         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2611         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2612         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2613         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2614         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2615         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2616         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2617         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2618         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2619         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2620         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2621         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2622         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2623         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2624         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2625         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2626         }
2627     } else {
2628         switch(srcFormat) {
2629         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2630         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2631         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2632         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2633         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2634         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2635         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2636         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2637         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2638         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2639         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2640         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2641         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2642         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2643         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2644         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2645         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2646         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2647         }
2648     }
2649
2650     c->lumToYV12 = NULL;
2651     c->alpToYV12 = NULL;
2652     switch (srcFormat) {
2653 #if HAVE_BIGENDIAN
2654     case PIX_FMT_YUV444P9LE:
2655     case PIX_FMT_YUV420P9LE:
2656     case PIX_FMT_YUV444P10LE:
2657     case PIX_FMT_YUV422P10LE:
2658     case PIX_FMT_YUV420P10LE:
2659     case PIX_FMT_YUV420P16LE:
2660     case PIX_FMT_YUV422P16LE:
2661     case PIX_FMT_YUV444P16LE:
2662     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2663 #else
2664     case PIX_FMT_YUV444P9BE:
2665     case PIX_FMT_YUV420P9BE:
2666     case PIX_FMT_YUV444P10BE:
2667     case PIX_FMT_YUV422P10BE:
2668     case PIX_FMT_YUV420P10BE:
2669     case PIX_FMT_YUV420P16BE:
2670     case PIX_FMT_YUV422P16BE:
2671     case PIX_FMT_YUV444P16BE:
2672     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2673 #endif
2674     case PIX_FMT_YUYV422  :
2675     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2676     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2677     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2678     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2679     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2680     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2681     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2682     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2683     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2684     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2685     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2686     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2687     case PIX_FMT_RGB8     :
2688     case PIX_FMT_BGR8     :
2689     case PIX_FMT_PAL8     :
2690     case PIX_FMT_BGR4_BYTE:
2691     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2692     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2693     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2694     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2695     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2696     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2697     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2698     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2699     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2700     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2701     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2702     }
2703     if (c->alpPixBuf) {
2704         switch (srcFormat) {
2705         case PIX_FMT_BGRA:
2706         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2707         case PIX_FMT_ABGR:
2708         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2709         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2710         }
2711     }
2712
2713     if (c->scalingBpp == 8) {
2714     c->hScale       = hScale_c;
2715     if (c->flags & SWS_FAST_BILINEAR) {
2716         c->hyscale_fast = hyscale_fast_c;
2717         c->hcscale_fast = hcscale_fast_c;
2718     }
2719
2720     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2721         if (c->srcRange) {
2722             c->lumConvertRange = lumRangeFromJpeg_c;
2723             c->chrConvertRange = chrRangeFromJpeg_c;
2724         } else {
2725             c->lumConvertRange = lumRangeToJpeg_c;
2726             c->chrConvertRange = chrRangeToJpeg_c;
2727         }
2728     }
2729     } else {
2730         c->hScale = hScale16_c;
2731         c->scale19To15Fw = scale19To15Fw_c;
2732         c->scale8To16Rv  = scale8To16Rv_c;
2733
2734         if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2735             if (c->srcRange) {
2736                 c->lumConvertRange = lumRangeFromJpeg16_c;
2737                 c->chrConvertRange = chrRangeFromJpeg16_c;
2738             } else {
2739                 c->lumConvertRange = lumRangeToJpeg16_c;
2740                 c->chrConvertRange = chrRangeToJpeg16_c;
2741             }
2742         }
2743     }
2744
2745     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2746           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2747         c->needs_hcscale = 1;
2748 }
2749
2750 SwsFunc ff_getSwsFunc(SwsContext *c)
2751 {
2752     sws_init_swScale_c(c);
2753
2754     if (HAVE_MMX)
2755         ff_sws_init_swScale_mmx(c);
2756     if (HAVE_ALTIVEC)
2757         ff_sws_init_swScale_altivec(c);
2758
2759     return swScale;
2760 }