git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 static const double rgb2yuv_table[8][9]={
  84     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  85     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  86     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  87     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  88     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  89     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  90     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  91     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
  92 };
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
 109 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
 110 {  1,   3,   1,   3,   1,   3,   1,   3, },
 111 {  2,   0,   2,   0,   2,   0,   2,   0, },
 112 };
 113
 114 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 115 {  6,   2,   6,   2,   6,   2,   6,   2, },
 116 {  0,   4,   0,   4,   0,   4,   0,   4, },
 117 };
 118
 119 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 120 {  8,   4,  11,   7,   8,   4,  11,   7, },
 121 {  2,  14,   1,  13,   2,  14,   1,  13, },
 122 { 10,   6,   9,   5,  10,   6,   9,   5, },
 123 {  0,  12,   3,  15,   0,  12,   3,  15, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 127 { 17,   9,  23,  15,  16,   8,  22,  14, },
 128 {  5,  29,   3,  27,   4,  28,   2,  26, },
 129 { 21,  13,  19,  11,  20,  12,  18,  10, },
 130 {  0,  24,   6,  30,   1,  25,   7,  31, },
 131 { 16,   8,  22,  14,  17,   9,  23,  15, },
 132 {  4,  28,   2,  26,   5,  29,   3,  27, },
 133 { 20,  12,  18,  10,  21,  13,  19,  11, },
 134 {  1,  25,   7,  31,   0,  24,   6,  30, },
 135 };
 136
 137 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 138 {  0,  55,  14,  68,   3,  58,  17,  72, },
 139 { 37,  18,  50,  32,  40,  22,  54,  35, },
 140 {  9,  64,   5,  59,  13,  67,   8,  63, },
 141 { 46,  27,  41,  23,  49,  31,  44,  26, },
 142 {  2,  57,  16,  71,   1,  56,  15,  70, },
 143 { 39,  21,  52,  34,  38,  19,  51,  33, },
 144 { 11,  66,   7,  62,  10,  65,   6,  60, },
 145 { 48,  30,  43,  25,  47,  29,  42,  24, },
 146 };
 147
 148 #if 1
 149 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 150 {117,  62, 158, 103, 113,  58, 155, 100, },
 151 { 34, 199,  21, 186,  31, 196,  17, 182, },
 152 {144,  89, 131,  76, 141,  86, 127,  72, },
 153 {  0, 165,  41, 206,  10, 175,  52, 217, },
 154 {110,  55, 151,  96, 120,  65, 162, 107, },
 155 { 28, 193,  14, 179,  38, 203,  24, 189, },
 156 {138,  83, 124,  69, 148,  93, 134,  79, },
 157 {  7, 172,  48, 213,   3, 168,  45, 210, },
 158 };
 159 #elif 1
 160 // tries to correct a gamma of 1.5
 161 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 162 {  0, 143,  18, 200,   2, 156,  25, 215, },
 163 { 78,  28, 125,  64,  89,  36, 138,  74, },
 164 { 10, 180,   3, 161,  16, 195,   8, 175, },
 165 {109,  51,  93,  38, 121,  60, 105,  47, },
 166 {  1, 152,  23, 210,   0, 147,  20, 205, },
 167 { 85,  33, 134,  71,  81,  30, 130,  67, },
 168 { 14, 190,   6, 171,  12, 185,   5, 166, },
 169 {117,  57, 101,  44, 113,  54,  97,  41, },
 170 };
 171 #elif 1
 172 // tries to correct a gamma of 2.0
 173 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 174 {  0, 124,   8, 193,   0, 140,  12, 213, },
 175 { 55,  14, 104,  42,  66,  19, 119,  52, },
 176 {  3, 168,   1, 145,   6, 187,   3, 162, },
 177 { 86,  31,  70,  21,  99,  39,  82,  28, },
 178 {  0, 134,  11, 206,   0, 129,   9, 200, },
 179 { 62,  17, 114,  48,  58,  16, 109,  45, },
 180 {  5, 181,   2, 157,   4, 175,   1, 151, },
 181 { 95,  36,  78,  26,  90,  34,  74,  24, },
 182 };
 183 #else
 184 // tries to correct a gamma of 2.5
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 186 {  0, 107,   3, 187,   0, 125,   6, 212, },
 187 { 39,   7,  86,  28,  49,  11, 102,  36, },
 188 {  1, 158,   0, 131,   3, 180,   1, 151, },
 189 { 68,  19,  52,  12,  81,  25,  64,  17, },
 190 {  0, 119,   5, 203,   0, 113,   4, 195, },
 191 { 45,   9,  96,  33,  42,   8,  91,  30, },
 192 {  2, 172,   1, 144,   2, 165,   0, 137, },
 193 { 77,  23,  60,  15,  72,  21,  56,  14, },
 194 };
 195 #endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 203                       uint16_t *aDest, int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     int shift = 11 + 16 - output_bits;
 209
 210 #define output_pixel(pos, val) \
 211     if (big_endian) { \
 212         if (output_bits == 16) { \
 213             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 214         } else { \
 215             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 216         } \
 217     } else { \
 218         if (output_bits == 16) { \
 219             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 220         } else { \
 221             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 222         } \
 223     }
 224     for (i = 0; i < dstW; i++) {
 225         int val = 1 << (26-output_bits);
 226         int j;
 227
 228         for (j = 0; j < lumFilterSize; j++)
 229             val += lumSrc[j][i] * lumFilter[j];
 230
 231         output_pixel(&dest[i], val);
 232     }
 233
 234     if (uDest) {
 235         for (i = 0; i < chrDstW; i++) {
 236             int u = 1 << (26-output_bits);
 237             int v = 1 << (26-output_bits);
 238             int j;
 239
 240             for (j = 0; j < chrFilterSize; j++) {
 241                 u += chrUSrc[j][i] * chrFilter[j];
 242                 v += chrVSrc[j][i] * chrFilter[j];
 243             }
 244
 245             output_pixel(&uDest[i], u);
 246             output_pixel(&vDest[i], v);
 247         }
 248     }
 249
 250     if (CONFIG_SWSCALE_ALPHA && aDest) {
 251         for (i = 0; i < dstW; i++) {
 252             int val = 1 << (26-output_bits);
 253             int j;
 254
 255             for (j = 0; j < lumFilterSize; j++)
 256                 val += alpSrc[j][i] * lumFilter[j];
 257
 258             output_pixel(&aDest[i], val);
 259         }
 260     }
 261 #undef output_pixel
 262 }
 263
 264 #define yuv2NBPS(bits, BE_LE, is_be) \
 265 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 266                               const int16_t **lumSrc, int lumFilterSize, \
 267                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 268                               const int16_t **chrVSrc, \
 269                               int chrFilterSize, const int16_t **alpSrc, \
 270                               uint8_t *_dest, uint8_t *_uDest, uint8_t *_vDest, \
 271                               uint8_t *_aDest, int dstW, int chrDstW) \
 272 { \
 273     uint16_t *dest  = (uint16_t *) _dest,  *uDest = (uint16_t *) _uDest, \
 274              *vDest = (uint16_t *) _vDest, *aDest = (uint16_t *) _aDest; \
 275     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 276                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 277                           alpSrc, \
 278                           dest, uDest, vDest, aDest, \
 279                           dstW, chrDstW, is_be, bits); \
 280 }
 281 yuv2NBPS( 9, BE, 1);
 282 yuv2NBPS( 9, LE, 0);
 283 yuv2NBPS(10, BE, 1);
 284 yuv2NBPS(10, LE, 0);
 285 yuv2NBPS(16, BE, 1);
 286 yuv2NBPS(16, LE, 0);
 287
 288 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 289                        const int16_t **lumSrc, int lumFilterSize,
 290                        const int16_t *chrFilter, const int16_t **chrUSrc,
 291                        const int16_t **chrVSrc,
 292                        int chrFilterSize, const int16_t **alpSrc,
 293                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 294                        uint8_t *aDest, int dstW, int chrDstW)
 295 {
 296     //FIXME Optimize (just quickly written not optimized..)
 297     int i;
 298     for (i=0; i<dstW; i++) {
 299         int val=1<<18;
 300         int j;
 301         for (j=0; j<lumFilterSize; j++)
 302             val += lumSrc[j][i] * lumFilter[j];
 303
 304         dest[i]= av_clip_uint8(val>>19);
 305     }
 306
 307     if (uDest)
 308         for (i=0; i<chrDstW; i++) {
 309             int u=1<<18;
 310             int v=1<<18;
 311             int j;
 312             for (j=0; j<chrFilterSize; j++) {
 313                 u += chrUSrc[j][i] * chrFilter[j];
 314                 v += chrVSrc[j][i] * chrFilter[j];
 315             }
 316
 317             uDest[i]= av_clip_uint8(u>>19);
 318             vDest[i]= av_clip_uint8(v>>19);
 319         }
 320
 321     if (CONFIG_SWSCALE_ALPHA && aDest)
 322         for (i=0; i<dstW; i++) {
 323             int val=1<<18;
 324             int j;
 325             for (j=0; j<lumFilterSize; j++)
 326                 val += alpSrc[j][i] * lumFilter[j];
 327
 328             aDest[i]= av_clip_uint8(val>>19);
 329         }
 330 }
 331
 332 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 333                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 334                        const int16_t *alpSrc,
 335                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 336                        uint8_t *aDest, int dstW, int chrDstW)
 337 {
 338     int i;
 339     for (i=0; i<dstW; i++) {
 340         int val= (lumSrc[i]+64)>>7;
 341         dest[i]= av_clip_uint8(val);
 342     }
 343
 344     if (uDest)
 345         for (i=0; i<chrDstW; i++) {
 346             int u=(chrUSrc[i]+64)>>7;
 347             int v=(chrVSrc[i]+64)>>7;
 348             uDest[i]= av_clip_uint8(u);
 349             vDest[i]= av_clip_uint8(v);
 350         }
 351
 352     if (CONFIG_SWSCALE_ALPHA && aDest)
 353         for (i=0; i<dstW; i++) {
 354             int val= (alpSrc[i]+64)>>7;
 355             aDest[i]= av_clip_uint8(val);
 356         }
 357 }
 358
 359 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 360                         const int16_t **lumSrc, int lumFilterSize,
 361                         const int16_t *chrFilter, const int16_t **chrUSrc,
 362                         const int16_t **chrVSrc, int chrFilterSize,
 363                         const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest,
 364                         uint8_t *vDest, uint8_t *aDest,
 365                         int dstW, int chrDstW)
 366 {
 367     enum PixelFormat dstFormat = c->dstFormat;
 368
 369     //FIXME Optimize (just quickly written not optimized..)
 370     int i;
 371     for (i=0; i<dstW; i++) {
 372         int val=1<<18;
 373         int j;
 374         for (j=0; j<lumFilterSize; j++)
 375             val += lumSrc[j][i] * lumFilter[j];
 376
 377         dest[i]= av_clip_uint8(val>>19);
 378     }
 379
 380     if (!uDest)
 381         return;
 382
 383     if (dstFormat == PIX_FMT_NV12)
 384         for (i=0; i<chrDstW; i++) {
 385             int u=1<<18;
 386             int v=1<<18;
 387             int j;
 388             for (j=0; j<chrFilterSize; j++) {
 389                 u += chrUSrc[j][i] * chrFilter[j];
 390                 v += chrVSrc[j][i] * chrFilter[j];
 391             }
 392
 393             uDest[2*i]= av_clip_uint8(u>>19);
 394             uDest[2*i+1]= av_clip_uint8(v>>19);
 395         }
 396     else
 397         for (i=0; i<chrDstW; i++) {
 398             int u=1<<18;
 399             int v=1<<18;
 400             int j;
 401             for (j=0; j<chrFilterSize; j++) {
 402                 u += chrUSrc[j][i] * chrFilter[j];
 403                 v += chrVSrc[j][i] * chrFilter[j];
 404             }
 405
 406             uDest[2*i]= av_clip_uint8(v>>19);
 407             uDest[2*i+1]= av_clip_uint8(u>>19);
 408         }
 409 }
 410
 411 static av_always_inline void
 412 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 413                         const int16_t **lumSrc, int lumFilterSize,
 414                         const int16_t *chrFilter, const int16_t **chrUSrc,
 415                         const int16_t **chrVSrc, int chrFilterSize,
 416                         const int16_t **alpSrc, uint8_t *dest, int dstW,
 417                         int y, enum PixelFormat target)
 418 {
 419     int i;
 420
 421 #define output_pixel(pos, val) \
 422         if (target == PIX_FMT_GRAY16BE) { \
 423             AV_WB16(pos, val); \
 424         } else { \
 425             AV_WL16(pos, val); \
 426         }
 427     for (i = 0; i < (dstW >> 1); i++) {
 428         int j;
 429         int Y1 = 1 << 18;
 430         int Y2 = 1 << 18;
 431         const int i2 = 2 * i;
 432
 433         for (j = 0; j < lumFilterSize; j++) {
 434             Y1 += lumSrc[j][i2]   * lumFilter[j];
 435             Y2 += lumSrc[j][i2+1] * lumFilter[j];
 436         }
 437         Y1 >>= 11;
 438         Y2 >>= 11;
 439         if ((Y1 | Y2) & 0x10000) {
 440             Y1 = av_clip_uint16(Y1);
 441             Y2 = av_clip_uint16(Y2);
 442         }
 443         output_pixel(&dest[2 * i2 + 0], Y1);
 444         output_pixel(&dest[2 * i2 + 2], Y2);
 445     }
 446 }
 447
 448 static av_always_inline void
 449 yuv2gray16_2_c_template(SwsContext *c, const uint16_t *buf0,
 450                         const uint16_t *buf1, const uint16_t *ubuf0,
 451                         const uint16_t *ubuf1, const uint16_t *vbuf0,
 452                         const uint16_t *vbuf1, const uint16_t *abuf0,
 453                         const uint16_t *abuf1, uint8_t *dest, int dstW,
 454                         int yalpha, int uvalpha, int y,
 455                         enum PixelFormat target)
 456 {
 457     int  yalpha1 = 4095 - yalpha; \
 458     int i;
 459
 460     for (i = 0; i < (dstW >> 1); i++) {
 461         const int i2 = 2 * i;
 462         int Y1 = (buf0[i2  ] * yalpha1 + buf1[i2  ] * yalpha) >> 11;
 463         int Y2 = (buf0[i2+1] * yalpha1 + buf1[i2+1] * yalpha) >> 11;
 464
 465         output_pixel(&dest[2 * i2 + 0], Y1);
 466         output_pixel(&dest[2 * i2 + 2], Y2);
 467     }
 468 }
 469
 470 static av_always_inline void
 471 yuv2gray16_1_c_template(SwsContext *c, const uint16_t *buf0,
 472                         const uint16_t *ubuf0, const uint16_t *ubuf1,
 473                         const uint16_t *vbuf0, const uint16_t *vbuf1,
 474                         const uint16_t *abuf0, uint8_t *dest, int dstW,
 475                         int uvalpha, enum PixelFormat dstFormat,
 476                         int flags, int y, enum PixelFormat target)
 477 {
 478     int i;
 479
 480     for (i = 0; i < (dstW >> 1); i++) {
 481         const int i2 = 2 * i;
 482         int Y1 = buf0[i2  ] << 1;
 483         int Y2 = buf0[i2+1] << 1;
 484
 485         output_pixel(&dest[2 * i2 + 0], Y1);
 486         output_pixel(&dest[2 * i2 + 2], Y2);
 487     }
 488 #undef output_pixel
 489 }
 490
 491 #define YUV2PACKEDWRAPPER(name, ext, fmt) \
 492 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 493                         const int16_t **lumSrc, int lumFilterSize, \
 494                         const int16_t *chrFilter, const int16_t **chrUSrc, \
 495                         const int16_t **chrVSrc, int chrFilterSize, \
 496                         const int16_t **alpSrc, uint8_t *dest, int dstW, \
 497                         int y) \
 498 { \
 499     name ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 500                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 501                           alpSrc, dest, dstW, y, fmt); \
 502 } \
 503  \
 504 static void name ## ext ## _2_c(SwsContext *c, const uint16_t *buf0, \
 505                         const uint16_t *buf1, const uint16_t *ubuf0, \
 506                         const uint16_t *ubuf1, const uint16_t *vbuf0, \
 507                         const uint16_t *vbuf1, const uint16_t *abuf0, \
 508                         const uint16_t *abuf1, uint8_t *dest, int dstW, \
 509                         int yalpha, int uvalpha, int y) \
 510 { \
 511     name ## _2_c_template(c, buf0, buf1, ubuf0, ubuf1, \
 512                           vbuf0, vbuf1, abuf0, abuf1, \
 513                           dest, dstW, yalpha, uvalpha, y, fmt); \
 514 } \
 515  \
 516 static void name ## ext ## _1_c(SwsContext *c, const uint16_t *buf0, \
 517                         const uint16_t *ubuf0, const uint16_t *ubuf1, \
 518                         const uint16_t *vbuf0, const uint16_t *vbuf1, \
 519                         const uint16_t *abuf0, uint8_t *dest, int dstW, \
 520                         int uvalpha, enum PixelFormat dstFormat, \
 521                         int flags, int y) \
 522 { \
 523     name ## _1_c_template(c, buf0, ubuf0, ubuf1, vbuf0, \
 524                           vbuf1, abuf0, dest, dstW, uvalpha, \
 525                           dstFormat, flags, y, fmt); \
 526 }
 527
 528 YUV2PACKEDWRAPPER(yuv2gray16, LE, PIX_FMT_GRAY16LE);
 529 YUV2PACKEDWRAPPER(yuv2gray16, BE, PIX_FMT_GRAY16BE);
 530
 531 static av_always_inline void
 532 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 533                       const int16_t **lumSrc, int lumFilterSize,
 534                       const int16_t *chrFilter, const int16_t **chrUSrc,
 535                       const int16_t **chrVSrc, int chrFilterSize,
 536                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 537                       int y, enum PixelFormat target)
 538 {
 539     const uint8_t * const d128=dither_8x8_220[y&7];
 540     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 541     int i;
 542     int acc = 0;
 543
 544 #define output_pixel(pos, acc) \
 545     if (target == PIX_FMT_MONOBLACK) { \
 546         pos = acc; \
 547     } else { \
 548         pos = ~acc; \
 549     }
 550     for (i = 0; i < dstW - 1; i += 2) {
 551         int j;
 552         int Y1 = 1 << 18;
 553         int Y2 = 1 << 18;
 554
 555         for (j = 0; j < lumFilterSize; j++) {
 556             Y1 += lumSrc[j][i]   * lumFilter[j];
 557             Y2 += lumSrc[j][i+1] * lumFilter[j];
 558         }
 559         Y1 >>= 19;
 560         Y2 >>= 19;
 561         if ((Y1 | Y2) & 0x100) {
 562             Y1 = av_clip_uint8(Y1);
 563             Y2 = av_clip_uint8(Y2);
 564         }
 565         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 566         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 567         if ((i & 7) == 6) {
 568             output_pixel(*dest++, acc);
 569         }
 570     }
 571 }
 572
 573 static av_always_inline void
 574 yuv2mono_2_c_template(SwsContext *c, const uint16_t *buf0,
 575                       const uint16_t *buf1, const uint16_t *ubuf0,
 576                       const uint16_t *ubuf1, const uint16_t *vbuf0,
 577                       const uint16_t *vbuf1, const uint16_t *abuf0,
 578                       const uint16_t *abuf1, uint8_t *dest, int dstW,
 579                       int yalpha, int uvalpha, int y,
 580                       enum PixelFormat target)
 581 {
 582     const uint8_t * const d128 = dither_8x8_220[y & 7];
 583     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 584     int  yalpha1 = 4095 - yalpha;
 585     int i;
 586
 587     for (i = 0; i < dstW - 7; i += 8) {
 588         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 589         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 590         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 591         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 592         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 593         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 594         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 595         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 596         output_pixel(*dest++, acc);
 597     }
 598 }
 599
 600 static av_always_inline void
 601 yuv2mono_1_c_template(SwsContext *c, const uint16_t *buf0,
 602                       const uint16_t *ubuf0, const uint16_t *ubuf1,
 603                       const uint16_t *vbuf0, const uint16_t *vbuf1,
 604                       const uint16_t *abuf0, uint8_t *dest, int dstW,
 605                       int uvalpha, enum PixelFormat dstFormat,
 606                       int flags, int y, enum PixelFormat target)
 607 {
 608     const uint8_t * const d128 = dither_8x8_220[y & 7];
 609     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 610     int i;
 611
 612     for (i = 0; i < dstW - 7; i += 8) {
 613         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 614         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 615         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 616         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 617         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 618         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 619         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 620         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 621         output_pixel(*dest++, acc);
 622     }
 623 #undef output_pixel
 624 }
 625
 626 YUV2PACKEDWRAPPER(yuv2mono, white, PIX_FMT_MONOWHITE);
 627 YUV2PACKEDWRAPPER(yuv2mono, black, PIX_FMT_MONOBLACK);
 628
 629 #define YSCALE_YUV_2_PACKEDX_C(type,alpha) \
 630     for (i=0; i<(dstW>>1); i++) {\
 631         int j;\
 632         int Y1 = 1<<18;\
 633         int Y2 = 1<<18;\
 634         int U  = 1<<18;\
 635         int V  = 1<<18;\
 636         int av_unused A1, A2;\
 637         type av_unused *r, *b, *g;\
 638         const int i2= 2*i;\
 639         \
 640         for (j=0; j<lumFilterSize; j++) {\
 641             Y1 += lumSrc[j][i2] * lumFilter[j];\
 642             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 643         }\
 644         for (j=0; j<chrFilterSize; j++) {\
 645             U += chrUSrc[j][i] * chrFilter[j];\
 646             V += chrVSrc[j][i] * chrFilter[j];\
 647         }\
 648         Y1>>=19;\
 649         Y2>>=19;\
 650         U >>=19;\
 651         V >>=19;\
 652         if ((Y1|Y2|U|V)&0x100) {\
 653             Y1 = av_clip_uint8(Y1); \
 654             Y2 = av_clip_uint8(Y2); \
 655             U  = av_clip_uint8(U); \
 656             V  = av_clip_uint8(V); \
 657         }\
 658         if (alpha) {\
 659             A1 = 1<<18;\
 660             A2 = 1<<18;\
 661             for (j=0; j<lumFilterSize; j++) {\
 662                 A1 += alpSrc[j][i2  ] * lumFilter[j];\
 663                 A2 += alpSrc[j][i2+1] * lumFilter[j];\
 664             }\
 665             A1>>=19;\
 666             A2>>=19;\
 667             if ((A1|A2)&0x100) {\
 668                 A1 = av_clip_uint8(A1); \
 669                 A2 = av_clip_uint8(A2); \
 670             }\
 671         }
 672
 673 #define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
 674     for (i=0; i<dstW; i++) {\
 675         int j;\
 676         int Y = 0;\
 677         int U = -128<<19;\
 678         int V = -128<<19;\
 679         int av_unused A;\
 680         int R,G,B;\
 681         \
 682         for (j=0; j<lumFilterSize; j++) {\
 683             Y += lumSrc[j][i     ] * lumFilter[j];\
 684         }\
 685         for (j=0; j<chrFilterSize; j++) {\
 686             U += chrUSrc[j][i] * chrFilter[j];\
 687             V += chrVSrc[j][i] * chrFilter[j];\
 688         }\
 689         Y >>=10;\
 690         U >>=10;\
 691         V >>=10;\
 692         if (alpha) {\
 693             A = rnd;\
 694             for (j=0; j<lumFilterSize; j++)\
 695                 A += alpSrc[j][i     ] * lumFilter[j];\
 696             A >>=19;\
 697             if (A&0x100)\
 698                 A = av_clip_uint8(A);\
 699         }\
 700         Y-= c->yuv2rgb_y_offset;\
 701         Y*= c->yuv2rgb_y_coeff;\
 702         Y+= rnd;\
 703         R= Y + V*c->yuv2rgb_v2r_coeff;\
 704         G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
 705         B= Y +                          U*c->yuv2rgb_u2b_coeff;\
 706         if ((R|G|B)&(0xC0000000)) {\
 707             R = av_clip_uintp2(R, 30); \
 708             G = av_clip_uintp2(G, 30); \
 709             B = av_clip_uintp2(B, 30); \
 710         }
 711
 712 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
 713     YSCALE_YUV_2_PACKEDX_C(type,alpha)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
 714     r = (type *)c->table_rV[V];   \
 715     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
 716     b = (type *)c->table_bU[U];
 717
 718 #define YSCALE_YUV_2_PACKED2_C(type,alpha)   \
 719     for (i=0; i<(dstW>>1); i++) { \
 720         const int i2= 2*i;       \
 721         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
 722         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
 723         int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19;              \
 724         int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19;              \
 725         type av_unused *r, *b, *g;                                    \
 726         int av_unused A1, A2;                                         \
 727         if (alpha) {\
 728             A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
 729             A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
 730         }
 731
 732 #define YSCALE_YUV_2_RGB2_C(type,alpha) \
 733     YSCALE_YUV_2_PACKED2_C(type,alpha)\
 734     r = (type *)c->table_rV[V];\
 735     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 736     b = (type *)c->table_bU[U];
 737
 738 #define YSCALE_YUV_2_PACKED1_C(type,alpha) \
 739     for (i=0; i<(dstW>>1); i++) {\
 740         const int i2= 2*i;\
 741         int Y1= buf0[i2  ]>>7;\
 742         int Y2= buf0[i2+1]>>7;\
 743         int U= (ubuf1[i])>>7;\
 744         int V= (vbuf1[i])>>7;\
 745         type av_unused *r, *b, *g;\
 746         int av_unused A1, A2;\
 747         if (alpha) {\
 748             A1= abuf0[i2  ]>>7;\
 749             A2= abuf0[i2+1]>>7;\
 750         }
 751
 752 #define YSCALE_YUV_2_RGB1_C(type,alpha) \
 753     YSCALE_YUV_2_PACKED1_C(type,alpha)\
 754     r = (type *)c->table_rV[V];\
 755     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 756     b = (type *)c->table_bU[U];
 757
 758 #define YSCALE_YUV_2_PACKED1B_C(type,alpha) \
 759     for (i=0; i<(dstW>>1); i++) {\
 760         const int i2= 2*i;\
 761         int Y1= buf0[i2  ]>>7;\
 762         int Y2= buf0[i2+1]>>7;\
 763         int U= (ubuf0[i] + ubuf1[i])>>8;\
 764         int V= (vbuf0[i] + vbuf1[i])>>8;\
 765         type av_unused *r, *b, *g;\
 766         int av_unused A1, A2;\
 767         if (alpha) {\
 768             A1= abuf0[i2  ]>>7;\
 769             A2= abuf0[i2+1]>>7;\
 770         }
 771
 772 #define YSCALE_YUV_2_RGB1B_C(type,alpha) \
 773     YSCALE_YUV_2_PACKED1B_C(type,alpha)\
 774     r = (type *)c->table_rV[V];\
 775     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 776     b = (type *)c->table_bU[U];
 777
/*
 * Emits the per-format output switch shared by the packed-pixel writers.
 *
 * func  - NAME of a loop-opening macro invoked as func(type, alpha). It must
 *         open a loop (leaving the brace for this macro to close) and
 *         provide i, i2, Y1, Y2, the r/g/b lookup pointers and, when alpha
 *         is nonzero, A1/A2.
 * func2 - already-expanded loop-opening code used for the two packed YUV
 *         outputs (YUYV422 / UYVY422); it must provide i2, Y1, Y2, U and V.
 *
 * Dispatches on c->dstFormat. Every case stores two output pixels per loop
 * iteration into dest and then closes the loop opened by func/func2:
 *   - RGB48/BGR48 write each looked-up 8-bit value into both bytes of the
 *     16-bit component and advance dest by 12 bytes per pair;
 *   - the 32-bit cases sum the three table entries and, when
 *     CONFIG_SWSCALE_ALPHA is enabled and c->alpPixBuf is set, add alpha
 *     (shifted to bit 24 for RGBA/BGRA, unshifted for ARGB/ABGR). With
 *     CONFIG_SMALL a single loop with a runtime needAlpha flag is emitted
 *     instead of two specialised loops;
 *   - RGB24/BGR24 store three bytes per pixel and advance dest by 6;
 *   - the 16-bit and smaller RGB cases add dither offsets to the luma index
 *     before the table lookup; the offsets are selected by y and, for the
 *     8-bit and 4-bit formats, also by the horizontal position;
 *   - RGB4/BGR4 pack two pixels into one byte (second pixel in the high
 *     nibble).
 * There is no default case: any other dstFormat produces no output.
 *
 * c, dest, y and everything func/func2 reference must be in scope at the
 * expansion site.
 */
 778 #define YSCALE_YUV_2_ANYRGB_C(func, func2)\
 779     switch(c->dstFormat) {\
 780     case PIX_FMT_RGB48BE:\
 781     case PIX_FMT_RGB48LE:\
 782         func(uint8_t,0)\
 783             ((uint8_t*)dest)[ 0]= r[Y1];\
 784             ((uint8_t*)dest)[ 1]= r[Y1];\
 785             ((uint8_t*)dest)[ 2]= g[Y1];\
 786             ((uint8_t*)dest)[ 3]= g[Y1];\
 787             ((uint8_t*)dest)[ 4]= b[Y1];\
 788             ((uint8_t*)dest)[ 5]= b[Y1];\
 789             ((uint8_t*)dest)[ 6]= r[Y2];\
 790             ((uint8_t*)dest)[ 7]= r[Y2];\
 791             ((uint8_t*)dest)[ 8]= g[Y2];\
 792             ((uint8_t*)dest)[ 9]= g[Y2];\
 793             ((uint8_t*)dest)[10]= b[Y2];\
 794             ((uint8_t*)dest)[11]= b[Y2];\
 795             dest+=12;\
 796         }\
 797         break;\
 798     case PIX_FMT_BGR48BE:\
 799     case PIX_FMT_BGR48LE:\
 800         func(uint8_t,0)\
 801             ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
 802             ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
 803             ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
 804             ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
 805             ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
 806             ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
 807             dest+=12;\
 808         }\
 809         break;\
 810     case PIX_FMT_RGBA:\
 811     case PIX_FMT_BGRA:\
 812         if (CONFIG_SMALL) {\
 813             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 814             func(uint32_t,needAlpha)\
 815                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
 816                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
 817             }\
 818         } else {\
 819             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 820                 func(uint32_t,1)\
 821                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
 822                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
 823                 }\
 824             } else {\
 825                 func(uint32_t,0)\
 826                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 827                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 828                 }\
 829             }\
 830         }\
 831         break;\
 832     case PIX_FMT_ARGB:\
 833     case PIX_FMT_ABGR:\
 834         if (CONFIG_SMALL) {\
 835             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 836             func(uint32_t,needAlpha)\
 837                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
 838                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
 839             }\
 840         } else {\
 841             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 842                 func(uint32_t,1)\
 843                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
 844                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
 845                 }\
 846             } else {\
 847                 func(uint32_t,0)\
 848                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 849                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 850                 }\
 851             }\
 852         }                \
 853         break;\
 854     case PIX_FMT_RGB24:\
 855         func(uint8_t,0)\
 856             ((uint8_t*)dest)[0]= r[Y1];\
 857             ((uint8_t*)dest)[1]= g[Y1];\
 858             ((uint8_t*)dest)[2]= b[Y1];\
 859             ((uint8_t*)dest)[3]= r[Y2];\
 860             ((uint8_t*)dest)[4]= g[Y2];\
 861             ((uint8_t*)dest)[5]= b[Y2];\
 862             dest+=6;\
 863         }\
 864         break;\
 865     case PIX_FMT_BGR24:\
 866         func(uint8_t,0)\
 867             ((uint8_t*)dest)[0]= b[Y1];\
 868             ((uint8_t*)dest)[1]= g[Y1];\
 869             ((uint8_t*)dest)[2]= r[Y1];\
 870             ((uint8_t*)dest)[3]= b[Y2];\
 871             ((uint8_t*)dest)[4]= g[Y2];\
 872             ((uint8_t*)dest)[5]= r[Y2];\
 873             dest+=6;\
 874         }\
 875         break;\
 876     case PIX_FMT_RGB565:\
 877     case PIX_FMT_BGR565:\
 878         {\
 879             const int dr1= dither_2x2_8[y&1    ][0];\
 880             const int dg1= dither_2x2_4[y&1    ][0];\
 881             const int db1= dither_2x2_8[(y&1)^1][0];\
 882             const int dr2= dither_2x2_8[y&1    ][1];\
 883             const int dg2= dither_2x2_4[y&1    ][1];\
 884             const int db2= dither_2x2_8[(y&1)^1][1];\
 885             func(uint16_t,0)\
 886                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 887                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 888             }\
 889         }\
 890         break;\
 891     case PIX_FMT_RGB555:\
 892     case PIX_FMT_BGR555:\
 893         {\
 894             const int dr1= dither_2x2_8[y&1    ][0];\
 895             const int dg1= dither_2x2_8[y&1    ][1];\
 896             const int db1= dither_2x2_8[(y&1)^1][0];\
 897             const int dr2= dither_2x2_8[y&1    ][1];\
 898             const int dg2= dither_2x2_8[y&1    ][0];\
 899             const int db2= dither_2x2_8[(y&1)^1][1];\
 900             func(uint16_t,0)\
 901                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 902                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 903             }\
 904         }\
 905         break;\
 906     case PIX_FMT_RGB444:\
 907     case PIX_FMT_BGR444:\
 908         {\
 909             const int dr1= dither_4x4_16[y&3    ][0];\
 910             const int dg1= dither_4x4_16[y&3    ][1];\
 911             const int db1= dither_4x4_16[(y&3)^3][0];\
 912             const int dr2= dither_4x4_16[y&3    ][1];\
 913             const int dg2= dither_4x4_16[y&3    ][0];\
 914             const int db2= dither_4x4_16[(y&3)^3][1];\
 915             func(uint16_t,0)\
 916                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 917                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 918             }\
 919         }\
 920         break;\
 921     case PIX_FMT_RGB8:\
 922     case PIX_FMT_BGR8:\
 923         {\
 924             const uint8_t * const d64= dither_8x8_73[y&7];\
 925             const uint8_t * const d32= dither_8x8_32[y&7];\
 926             func(uint8_t,0)\
 927                 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
 928                 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
 929             }\
 930         }\
 931         break;\
 932     case PIX_FMT_RGB4:\
 933     case PIX_FMT_BGR4:\
 934         {\
 935             const uint8_t * const d64= dither_8x8_73 [y&7];\
 936             const uint8_t * const d128=dither_8x8_220[y&7];\
 937             func(uint8_t,0)\
 938                 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
 939                                  + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
 940             }\
 941         }\
 942         break;\
 943     case PIX_FMT_RGB4_BYTE:\
 944     case PIX_FMT_BGR4_BYTE:\
 945         {\
 946             const uint8_t * const d64= dither_8x8_73 [y&7];\
 947             const uint8_t * const d128=dither_8x8_220[y&7];\
 948             func(uint8_t,0)\
 949                 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
 950                 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
 951             }\
 952         }\
 953         break;\
 954     case PIX_FMT_YUYV422:\
 955         func2\
 956             ((uint8_t*)dest)[2*i2+0]= Y1;\
 957             ((uint8_t*)dest)[2*i2+1]= U;\
 958             ((uint8_t*)dest)[2*i2+2]= Y2;\
 959             ((uint8_t*)dest)[2*i2+3]= V;\
 960         }                \
 961         break;\
 962     case PIX_FMT_UYVY422:\
 963         func2\
 964             ((uint8_t*)dest)[2*i2+0]= U;\
 965             ((uint8_t*)dest)[2*i2+1]= Y1;\
 966             ((uint8_t*)dest)[2*i2+2]= V;\
 967             ((uint8_t*)dest)[2*i2+3]= Y2;\
 968         }                \
 969         break;\
 970     }
 971
/**
 * Write one line of packed output in the format given by c->dstFormat.
 *
 * The whole body is the YSCALE_YUV_2_ANYRGB_C switch, instantiated with the
 * YSCALE_YUV_2_RGBX_C loop opener for RGB outputs and
 * YSCALE_YUV_2_PACKEDX_C(void,0) for the packed YUV outputs. Both loop
 * openers are defined outside this view; the filter, source, dstW and y
 * parameters are not referenced directly here and are presumably consumed
 * by those macros -- TODO confirm against their definitions.
 *
 * @param dest destination line; written by the switch cases
 * @param y    output line number; used by the dithering cases of the switch
 */
 972 static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
 973                           const int16_t **lumSrc, int lumFilterSize,
 974                           const int16_t *chrFilter, const int16_t **chrUSrc,
 975                           const int16_t **chrVSrc, int chrFilterSize,
 976                           const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 977 {
 978     int i;
 979     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void,0))
 980 }
 981
/**
 * Write one line of packed 24/32-bit RGB output, one pixel per iteration of
 * the YSCALE_YUV_2_RGBX_FULL_C loop macro.
 *
 * That macro is defined outside this view; it is expected to open a loop and
 * provide R, G, B (and A when alpha is requested) for each pixel, with this
 * function supplying the loop body and closing brace. The filter, source,
 * dstW and y parameters are not referenced directly in this body and are
 * presumably consumed by the macro -- TODO confirm.
 *
 * Supported c->dstFormat values are ARGB, RGB24, RGBA, ABGR, BGR24 and BGRA;
 * any other value reaches assert(0). Pixels are c->dstFormatBpp/8 bytes
 * apart. Alpha is taken from the source only when CONFIG_SWSCALE_ALPHA is
 * enabled and c->alpPixBuf is set; otherwise 255 is stored. With
 * CONFIG_SMALL one loop with a runtime needAlpha flag is used instead of two
 * specialised loops.
 */
 982 static void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
 983                             const int16_t **lumSrc, int lumFilterSize,
 984                             const int16_t *chrFilter, const int16_t **chrUSrc,
 985                             const int16_t **chrVSrc, int chrFilterSize,
 986                             const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
 987 {
 988     int i;
 989     int step= c->dstFormatBpp/8;
     /* index of the alpha byte relative to dest; adjusted by the
      * fall-through cases below */
 990     int aidx= 3;
 991
 992     switch(c->dstFormat) {
 993     case PIX_FMT_ARGB:
     /* Alpha precedes the colour bytes: skip it so dest[0..2] address the
      * colours; after the decrement below aidx is -1, i.e. the skipped byte. */
 994         dest++;
 995         aidx= 0;
     /* fall through */
 996     case PIX_FMT_RGB24:
     /* Entered directly (RGB24) this leaves aidx == 2: the alpha store then
      * lands on dest[2] and is overwritten by the dest[2] colour store that
      * follows it, so no alpha byte survives in the 3-byte pixel. */
 997         aidx--;
     /* fall through */
 998     case PIX_FMT_RGBA:
 999         if (CONFIG_SMALL) {
1000             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
1001             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
1002                 dest[aidx]= needAlpha ? A : 255;
1003                 dest[0]= R>>22;
1004                 dest[1]= G>>22;
1005                 dest[2]= B>>22;
1006                 dest+= step;
1007             }
1008         } else {
1009             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1010                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
1011                     dest[aidx]= A;
1012                     dest[0]= R>>22;
1013                     dest[1]= G>>22;
1014                     dest[2]= B>>22;
1015                     dest+= step;
1016                 }
1017             } else {
1018                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
1019                     dest[aidx]= 255;
1020                     dest[0]= R>>22;
1021                     dest[1]= G>>22;
1022                     dest[2]= B>>22;
1023                     dest+= step;
1024                 }
1025             }
1026         }
1027         break;
     /* Same scheme as above with the red and blue stores swapped. */
1028     case PIX_FMT_ABGR:
1029         dest++;
1030         aidx= 0;
     /* fall through */
1031     case PIX_FMT_BGR24:
1032         aidx--;
     /* fall through */
1033     case PIX_FMT_BGRA:
1034         if (CONFIG_SMALL) {
1035             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
1036             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
1037                 dest[aidx]= needAlpha ? A : 255;
1038                 dest[0]= B>>22;
1039                 dest[1]= G>>22;
1040                 dest[2]= R>>22;
1041                 dest+= step;
1042             }
1043         } else {
1044             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1045                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
1046                     dest[aidx]= A;
1047                     dest[0]= B>>22;
1048                     dest[1]= G>>22;
1049                     dest[2]= R>>22;
1050                     dest+= step;
1051                 }
1052             } else {
1053                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
1054                     dest[aidx]= 255;
1055                     dest[0]= B>>22;
1056                     dest[1]= G>>22;
1057                     dest[2]= R>>22;
1058                     dest+= step;
1059                 }
1060             }
1061         }
1062         break;
1063     default:
     /* unsupported destination format for this writer */
1064         assert(0);
1065     }
1066 }
1067
1068 /**
1069  * vertical bilinear scale YV12 to RGB
 *
 * The body is the YSCALE_YUV_2_ANYRGB_C switch on c->dstFormat,
 * instantiated with the YSCALE_YUV_2_RGB2_C and
 * YSCALE_YUV_2_PACKED2_C(void,0) loop openers (defined outside this view).
 *
 * yalpha1 and uvalpha1 are the complements of yalpha and uvalpha with
 * respect to 4095. They, the two lines of each plane (buf0/buf1,
 * ubuf0/ubuf1, vbuf0/vbuf1, abuf0/abuf1) and dstW are not referenced
 * directly here; presumably the loop openers blend the line pairs with
 * these weights -- TODO confirm against the macro definitions.
 *
 * @param dest destination line; written by the switch cases
 * @param y    output line number; used by the dithering cases of the switch
1070  */
1071 static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
1072                           const uint16_t *buf1, const uint16_t *ubuf0,
1073                           const uint16_t *ubuf1, const uint16_t *vbuf0,
1074                           const uint16_t *vbuf1, const uint16_t *abuf0,
1075                           const uint16_t *abuf1, uint8_t *dest, int dstW,
1076                           int yalpha, int uvalpha, int y)
1077 {
1078     int  yalpha1=4095- yalpha;
1079     int uvalpha1=4095-uvalpha;
1080     int i;
1081
1082     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0))
1083 }
1084
1085 /**
1086  * YV12 to RGB without scaling or interpolating
 *
 * Luma (and alpha) always come from the single line buf0 (abuf0). Chroma
 * handling depends on uvalpha:
 *   - uvalpha <  2048: the *1_C loop openers are used, which read chroma
 *     from ubuf1/vbuf1 only;
 *   - otherwise: the *1B_C loop openers are used, which sum both chroma
 *     lines (ubuf0+ubuf1, vbuf0+vbuf1) and shift by 8.
 *
 * The output format is selected by the switch inside
 * YSCALE_YUV_2_ANYRGB_C, which reads c->dstFormat; the dstFormat and flags
 * parameters are not referenced in this body.
 *
 * @param dest destination line; written by the switch cases
 * @param dstW used by the loop openers as the pixel count (pairs = dstW>>1)
 * @param y    output line number; used by the dithering cases of the switch
1087  */
1088 static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
1089                           const uint16_t *ubuf0, const uint16_t *ubuf1,
1090                           const uint16_t *vbuf0, const uint16_t *vbuf1,
1091                           const uint16_t *abuf0, uint8_t *dest, int dstW,
1092                           int uvalpha, enum PixelFormat dstFormat,
1093                           int flags, int y)
1094 {
1095     int i;
1096
1097     if (uvalpha < 2048) {
1098         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0))
1099     } else {
1100         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0))
1101     }
1102 }
1103
1104 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1105                                        int width, int height,
1106                                        int y, uint8_t val)
1107 {
1108     int i;
1109     uint8_t *ptr = plane + stride*y;
1110     for (i=0; i<height; i++) {
1111         memset(ptr, val, width);
1112         ptr += stride;
1113     }
1114 }
1115
/**
 * Convert a row of 48-bit packed RGB/BGR pixels (three 16-bit components,
 * 6 bytes per pixel) to 8-bit luma.
 *
 * Each component is read with the byte order selected by isBE(origin) and
 * reduced to its upper 8 bits. For PIX_FMT_BGR48BE / PIX_FMT_BGR48LE the
 * first and third components are swapped when forming r and b.
 * The rounding/offset term 33<<(RGB2YUV_SHIFT-1) is equivalent to adding
 * 16.5 in output units before the final shift.
 *
 * NOTE: the helper macros input_pixel, r and b defined inside this function
 * are intentionally left defined: the rgb48 chroma templates that follow in
 * the file use them, and the last one #undefs them. r and b expand to
 * expressions over the locals `a` and `c`, so those names must be kept.
 *
 * @param dst    output luma, one byte per pixel
 * @param src    input pixels
 * @param width  number of pixels
 * @param origin source pixel format (selects byte order and RGB/BGR order)
 */
1116 static av_always_inline void
1117 rgb48ToY_c_template(uint8_t *dst, const uint8_t *src, int width,
1118                     enum PixelFormat origin)
1119 {
1120     int i;
1121     for (i = 0; i < width; i++) {
1122 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1123         int a = input_pixel(&src[i*6+0]) >> 8;
1124         int g = input_pixel(&src[i*6+2]) >> 8;
1125         int c = input_pixel(&src[i*6+4]) >> 8;
1126
1127 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? c : a)
1128 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? a : c)
1129         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1130     }
1131 }
1132
/**
 * Convert a row of 48-bit packed RGB/BGR pixels to 8-bit U and V, one
 * chroma sample per input pixel.
 *
 * Uses the input_pixel, r and b macros that are defined earlier in the file
 * (inside rgb48ToY_c_template); r and b select between the locals `a` and
 * `c` according to origin, so those local names must be kept.
 * The term 257<<(RGB2YUV_SHIFT-1) is equivalent to adding 128.5 in output
 * units before the final shift.
 *
 * @param dstU,dstV output chroma rows, one byte per pixel each
 * @param src1      input pixels (6 bytes per pixel)
 * @param src2      must equal src1 (asserted); not read otherwise
 * @param width     number of pixels
 * @param origin    source pixel format (selects byte order and RGB/BGR order)
 */
1133 static av_always_inline void
1134 rgb48ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1135                     const uint8_t *src1, const uint8_t *src2,
1136                     int width, enum PixelFormat origin)
1137 {
1138     int i;
1139     assert(src1==src2);
1140     for (i = 0; i < width; i++) {
1141         int a = input_pixel(&src1[6*i + 0]) >> 8;
1142         int g = input_pixel(&src1[6*i + 2]) >> 8;
1143         int c = input_pixel(&src1[6*i + 4]) >> 8;
1144
1145         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1146         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1147     }
1148 }
1149
/**
 * Convert a row of 48-bit packed RGB/BGR pixels to 8-bit U and V, producing
 * one chroma sample per pair of horizontally adjacent input pixels.
 *
 * For every output sample the upper 8 bits of each component of two
 * neighbouring pixels (12 input bytes) are added together. Because the
 * component values are sums of two, the rounding term (257<<RGB2YUV_SHIFT)
 * and the final shift (RGB2YUV_SHIFT+1) are one bit larger than in the
 * per-pixel variant.
 *
 * Uses the input_pixel, r and b macros defined earlier in the file (inside
 * rgb48ToY_c_template) and removes all three with #undef at the end, so
 * they are not available to code after this function.
 *
 * @param dstU,dstV output chroma rows, `width` bytes each
 * @param src1      input pixels; 2*width pixels are read
 * @param src2      must equal src1 (asserted); not read otherwise
 * @param width     number of output samples
 * @param origin    source pixel format (selects byte order and RGB/BGR order)
 */
1150 static av_always_inline void
1151 rgb48ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1152                           const uint8_t *src1, const uint8_t *src2,
1153                           int width, enum PixelFormat origin)
1154 {
1155     int i;
1156     assert(src1==src2);
1157     for (i = 0; i < width; i++) {
1158         int a = (input_pixel(&src1[12*i + 0]) >> 8) + (input_pixel(&src1[12*i + 6]) >> 8);
1159         int g = (input_pixel(&src1[12*i + 2]) >> 8) + (input_pixel(&src1[12*i + 8]) >> 8);
1160         int c = (input_pixel(&src1[12*i + 4]) >> 8) + (input_pixel(&src1[12*i + 10]) >> 8);
1161
1162         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1163         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1164     }
1165 #undef r
1166 #undef b
1167 #undef input_pixel
1168 }
1169
/*
 * Generates the three non-inline entry points for one 48-bit RGB layout:
 *   <pattern>48<BE_LE>ToY_c       -> rgb48ToY_c_template
 *   <pattern>48<BE_LE>ToUV_c      -> rgb48ToUV_c_template
 *   <pattern>48<BE_LE>ToUV_half_c -> rgb48ToUV_half_c_template
 * Each wrapper forwards its arguments together with the constant `origin`
 * pixel format; the trailing `unused` parameter is ignored.
 *
 * pattern - name prefix (rgb or bgr)
 * BE_LE   - byte-order part of the generated name (BE or LE)
 * origin  - PIX_FMT_* constant passed to the templates
 */
1170 #define rgb48funcs(pattern, BE_LE, origin) \
1171 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *dst, const uint8_t *src, \
1172                                     int width, uint32_t *unused) \
1173 { \
1174     rgb48ToY_c_template(dst, src, width, origin); \
1175 } \
1176  \
1177 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1178                                     const uint8_t *src1, const uint8_t *src2, \
1179                                     int width, uint32_t *unused) \
1180 { \
1181     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1182 } \
1183  \
1184 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1185                                     const uint8_t *src1, const uint8_t *src2, \
1186                                     int width, uint32_t *unused) \
1187 { \
1188     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1189 }
1190
/* Instantiations: rgb48LE*, rgb48BE*, bgr48LE* and bgr48BE* function sets. */
1191 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1192 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1193 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1194 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1195
1196 static av_always_inline void
1197 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1198                        int width, enum PixelFormat origin,
1199                        int shr,   int shg,   int shb, int shp,
1200                        int maskr, int maskg, int maskb,
1201                        int rsh,   int gsh,   int bsh, int S)
1202 {
1203     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1204               rnd = 33 << (S - 1);
1205     int i;
1206
1207     for (i = 0; i < width; i++) {
1208 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1209                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1210                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1211         int px = input_pixel(i) >> shp;
1212         int b = (px & maskb) >> shb;
1213         int g = (px & maskg) >> shg;
1214         int r = (px & maskr) >> shr;
1215
1216         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1217     }
1218 }
1219
1220 static av_always_inline void
1221 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1222                         const uint8_t *src, int width,
1223                         enum PixelFormat origin,
1224                         int shr,   int shg,   int shb, int shp,
1225                         int maskr, int maskg, int maskb,
1226                         int rsh,   int gsh,   int bsh, int S)
1227 {
1228     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1229               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1230               rnd = 257 << (S - 1);
1231     int i;
1232
1233     for (i = 0; i < width; i++) {
1234         int px = input_pixel(i) >> shp;
1235         int b = (px & maskb) >> shb;
1236         int g = (px & maskg) >> shg;
1237         int r = (px & maskr) >> shr;
1238
1239         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1240         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1241     }
1242 }
1243
/**
 * Convert a row of packed 16/32-bit RGB pixels to 8-bit U and V, producing
 * one chroma sample per pair of horizontally adjacent input pixels.
 *
 * The two pixels are added in packed form rather than component by
 * component: g collects the bits outside the red and blue masks of both
 * pixels, and rb is what remains of the packed sum. The red, green and blue
 * masks are widened by one bit so that the one-bit-larger sums survive
 * masking. Because each component is a sum of two, the rounding term
 * (257 << S) and the final shift (S + 1) are one bit larger than in
 * rgb16_32ToUV_c_template.
 *
 * maskgx is computed from the caller's maskr/maskb BEFORE those masks are
 * widened, so the order of the initialisers and the mask update matters.
 *
 * Uses the input_pixel macro defined earlier in the file (inside
 * rgb16_32ToY_c_template) and removes it with #undef at the end.
 *
 * @param dstU,dstV output chroma rows, `width` bytes each
 * @param src       input pixels; 2*width pixels are read
 * @param width     number of output samples
 * @param origin    source pixel format
 * @param shr,shg,shb       right shifts applied to the extracted components
 * @param shp               right shift applied to each whole pixel first
 * @param maskr,maskg,maskb component bit masks (modified locally)
 * @param rsh,gsh,bsh       left shifts applied to the coefficients
 * @param S                 base for the final right shift (S + 1 is used)
 */
1244 static av_always_inline void
1245 rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1246                              const uint8_t *src, int width,
1247                              enum PixelFormat origin,
1248                              int shr,   int shg,   int shb, int shp,
1249                              int maskr, int maskg, int maskb,
1250                              int rsh,   int gsh,   int bsh, int S)
1251 {
1252     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1253               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1254               rnd = 257 << S, maskgx = ~(maskr | maskb);
1255     int i;
1256
     /* widen each mask by one bit to cover the carry of a two-pixel sum */
1257     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1258     for (i = 0; i < width; i++) {
1259         int px0 = input_pixel(2 * i + 0) >> shp;
1260         int px1 = input_pixel(2 * i + 1) >> shp;
1261         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1262         int rb = px0 + px1 - g;
1263
1264         b = (rb & maskb) >> shb;
     /* for a nonzero pixel shift and for the 565 formats the green sum is
      * only shifted; otherwise it is masked with the widened maskg first */
1265         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1266             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1267             g >>= shg;
1268         } else {
1269             g = (g  & maskg) >> shg;
1270         }
1271         r = (rb & maskr) >> shr;
1272
1273         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
1274         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
1275     }
1276 #undef input_pixel
1277 }
1278
/*
 * Generates the three non-inline entry points for one packed 16/32-bit RGB
 * format:
 *   <name>ToY_c       -> rgb16_32ToY_c_template
 *   <name>ToUV_c      -> rgb16_32ToUV_c_template
 *   <name>ToUV_half_c -> rgb16_32ToUV_half_c_template
 * Each wrapper forwards dst/src/width plus the per-format constants given
 * here; the `dummy` and `unused` parameters are ignored.
 *
 * fmt               - PIX_FMT_* constant passed to the templates as origin
 * name              - prefix of the generated function names
 * shr, shg, shb     - right shifts applied to the masked components
 * shp               - right shift applied to the whole pixel before masking
 * maskr/maskg/maskb - component bit masks
 * rsh, gsh, bsh     - left shifts applied to the conversion coefficients
 * S                 - final right shift of the weighted sum
 */
1279 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1280                          maskg, maskb, rsh, gsh, bsh, S) \
1281 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1282                           int width, uint32_t *unused) \
1283 { \
1284     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1285                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1286 } \
1287  \
1288 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1289                            const uint8_t *src, const uint8_t *dummy, \
1290                            int width, uint32_t *unused) \
1291 { \
1292     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1293                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1294 } \
1295  \
1296 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1297                                 const uint8_t *src, const uint8_t *dummy, \
1298                                 int width, uint32_t *unused) \
1299 { \
1300     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1301                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1302 }
1303
/*
 * Instantiations. Columns after the name follow the macro's parameter order:
 * shr, shg, shb, shp, maskr, maskg, maskb, rsh, gsh, bsh, S.
 */
1304 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1305 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1306 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1307 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1308 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1309 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1310 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1311 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1312 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1313 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1314 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1315 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1316
/**
 * Extract the alpha plane from packed 4-byte pixels whose alpha is the
 * first byte of each pixel.
 *
 * @param dst    output alpha, one byte per pixel
 * @param src    input pixels, 4 bytes each
 * @param width  number of pixels
 * @param unused ignored
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int px = 0;

    while (px < width) {
        const uint8_t *pixel = src + 4 * px;
        dst[px] = pixel[0];
        px++;
    }
}
1324
/**
 * Extract the alpha plane from packed 4-byte pixels whose alpha is the
 * last byte of each pixel.
 *
 * @param dst    output alpha, one byte per pixel
 * @param src    input pixels, 4 bytes each
 * @param width  number of pixels
 * @param unused ignored
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int px = 0;

    while (px < width) {
        const uint8_t *pixel = src + 4 * px;
        dst[px] = pixel[3];
        px++;
    }
}
1332
/**
 * Convert a row of palette indices to 8-bit luma.
 *
 * Each source byte indexes `pal`; the lowest byte of the selected entry is
 * stored as the output sample.
 *
 * @param dst   output luma, one byte per pixel
 * @param src   input palette indices, one byte per pixel
 * @param width number of pixels
 * @param pal   lookup table indexed by the source byte
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1342
/**
 * Convert a row of palette indices to 8-bit U and V.
 *
 * Each source byte indexes `pal`; bits 8..15 of the selected entry are
 * stored as U and bits 16..23 as V.
 *
 * @param dstU,dstV output chroma rows, one byte per pixel each
 * @param src1      input palette indices, one byte per pixel
 * @param src2      must equal src1 (asserted); not read otherwise
 * @param width     number of pixels
 * @param pal       lookup table indexed by the source byte
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
        x++;
    }
}
1356
/**
 * Expand a row of 1-bit-per-pixel data to 8-bit luma, with set bits mapped
 * to 0 and clear bits mapped to 255 (each source byte is inverted first).
 *
 * Pixels are packed 8 per byte, most significant bit first. When width is
 * not a multiple of 8 the remaining width%8 pixels are taken from the top
 * bits of the next source byte; previously those trailing pixels were never
 * written. The source row must therefore provide (width+7)/8 bytes.
 *
 * @param dst    output luma, one byte per pixel (`width` bytes are written)
 * @param src    packed input bits
 * @param width  number of pixels
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int whole_bytes = width / 8;
    const int tail_pixels = width % 8;
    int i, j;

    for (i = 0; i < whole_bytes; i++) {
        /* invert within 8 bits so the shifts below act on a non-negative value */
        const unsigned d = (uint8_t)~src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d >> (7-j)) & 1) * 255;
    }

    /* partial last byte: only its top tail_pixels bits are pixels */
    if (tail_pixels > 0) {
        const unsigned d = (uint8_t)~src[whole_bytes];
        for (j = 0; j < tail_pixels; j++)
            dst[8*whole_bytes+j] = ((d >> (7-j)) & 1) * 255;
    }
}
1367
/**
 * Expand a row of 1-bit-per-pixel data to 8-bit luma, with set bits mapped
 * to 255 and clear bits mapped to 0.
 *
 * Pixels are packed 8 per byte, most significant bit first. When width is
 * not a multiple of 8 the remaining width%8 pixels are taken from the top
 * bits of the next source byte; previously those trailing pixels were never
 * written. The source row must therefore provide (width+7)/8 bytes.
 *
 * @param dst    output luma, one byte per pixel (`width` bytes are written)
 * @param src    packed input bits
 * @param width  number of pixels
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int whole_bytes = width / 8;
    const int tail_pixels = width % 8;
    int i, j;

    for (i = 0; i < whole_bytes; i++) {
        const unsigned d = src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d >> (7-j)) & 1) * 255;
    }

    /* partial last byte: only its top tail_pixels bits are pixels */
    if (tail_pixels > 0) {
        const unsigned d = src[whole_bytes];
        for (j = 0; j < tail_pixels; j++)
            dst[8*whole_bytes+j] = ((d >> (7-j)) & 1) * 255;
    }
}
1378
1379 //FIXME: yuy2* can read up to 7 samples too many
1380
/**
 * Extract luma from a packed row in which every even byte is a luma sample
 * (Y at byte offsets 0, 2, 4, ...).
 *
 * @param dst    output luma, one byte per pixel
 * @param src    packed input row
 * @param width  number of pixels
 * @param unused ignored
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x];
        x++;
    }
}
1388
/**
 * Extract chroma from a packed row made of 4-byte groups in which byte 1 is
 * U and byte 3 is V.
 *
 * @param dstU,dstV output chroma rows, one byte per group each
 * @param src1      packed input row
 * @param src2      expected to equal src1 (asserted after the copy); not
 *                  read otherwise
 * @param width     number of chroma samples to produce
 * @param unused    ignored
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const uint8_t *group = src1 + 4 * x;

        dstU[x] = group[1];
        dstV[x] = group[3];
        x++;
    }
    assert(src1 == src2);
}
1399
/**
 * Reduce two rows of 2-byte samples to 8 bits by keeping the second byte of
 * every sample (the high byte when samples are stored little-endian).
 *
 * @param dstU   output row filled from src1
 * @param dstV   output row filled from src2
 * @param src1   first input row, 2 bytes per sample
 * @param src2   second input row, 2 bytes per sample
 * @param width  number of samples per row
 * @param unused ignored
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int hi = 2 * x + 1;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
        x++;
    }
}
1409
/* This is almost identical to yuy2ToY_c and exists only because calling
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract luma from a packed row in which every odd byte is a luma sample
 * (Y at byte offsets 1, 3, 5, ...).
 *
 * @param dst    output luma, one byte per pixel
 * @param src    packed input row
 * @param width  number of pixels
 * @param unused ignored
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x + 1];
        x++;
    }
}
1419
/**
 * Extract chroma from a packed row made of 4-byte groups in which byte 0 is
 * U and byte 2 is V.
 *
 * @param dstU,dstV output chroma rows, one byte per group each
 * @param src1      packed input row
 * @param src2      expected to equal src1 (asserted after the copy); not
 *                  read otherwise
 * @param width     number of chroma samples to produce
 * @param unused    ignored
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const uint8_t *group = src1 + 4 * x;

        dstU[x] = group[0];
        dstV[x] = group[2];
        x++;
    }
    assert(src1 == src2);
}
1430
/**
 * Reduce two rows of 2-byte samples to 8 bits by keeping the first byte of
 * every sample (the high byte when samples are stored big-endian).
 *
 * @param dstU   output row filled from src1
 * @param dstV   output row filled from src2
 * @param src1   first input row, 2 bytes per sample
 * @param src2   second input row, 2 bytes per sample
 * @param width  number of samples per row
 * @param unused ignored
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int hi = 2 * x;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
        x++;
    }
}
1440
1441 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1442                                         const uint8_t *src, int width)
1443 {
1444     int i;
1445     for (i = 0; i < width; i++) {
1446         dst1[i] = src[2*i+0];
1447         dst2[i] = src[2*i+1];
1448     }
1449 }
1450
/**
 * Split an interleaved chroma row into separate U and V rows, with the
 * first byte of every pair going to dstU and the second to dstV
 * (via nvXXtoUV_c).
 *
 * @param dstU,dstV output chroma rows, `width` bytes each
 * @param src1      interleaved input, 2 bytes per sample pair
 * @param src2      not used
 * @param width     number of sample pairs
 * @param unused    not used
 */
1451 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1452                        const uint8_t *src1, const uint8_t *src2,
1453                        int width, uint32_t *unused)
1454 {
1455     nvXXtoUV_c(dstU, dstV, src1, width);
1456 }
1457
/**
 * Split an interleaved chroma row into separate U and V rows, with the
 * first byte of every pair going to dstV and the second to dstU
 * (nvXXtoUV_c called with the destinations swapped).
 *
 * @param dstU,dstV output chroma rows, `width` bytes each
 * @param src1      interleaved input, 2 bytes per sample pair
 * @param src2      not used
 * @param width     number of sample pairs
 * @param unused    not used
 */
1458 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1459                        const uint8_t *src1, const uint8_t *src2,
1460                        int width, uint32_t *unused)
1461 {
1462     nvXXtoUV_c(dstV, dstU, src1, width);
1463 }
1464
1465 // FIXME Maybe dither instead.
1466 static av_always_inline void
1467 yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1468                           const uint8_t *_srcU, const uint8_t *_srcV,
1469                           int width, enum PixelFormat origin, int depth)
1470 {
1471     int i;
1472     const uint16_t *srcU = (const uint16_t *) _srcU;
1473     const uint16_t *srcV = (const uint16_t *) _srcV;
1474
1475 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1476     for (i = 0; i < width; i++) {
1477         dstU[i] = input_pixel(&srcU[i]) >> (depth - 8);
1478         dstV[i] = input_pixel(&srcV[i]) >> (depth - 8);
1479     }
1480 }
1481
1482 static av_always_inline void
1483 yuv9_or_10ToY_c_template(uint8_t *dstY, const uint8_t *_srcY,
1484                          int width, enum PixelFormat origin, int depth)
1485 {
1486     int i;
1487     const uint16_t *srcY = (const uint16_t*)_srcY;
1488
1489     for (i = 0; i < width; i++)
1490         dstY[i] = input_pixel(&srcY[i]) >> (depth - 8);
1491 #undef input_pixel
1492 }
1493
/*
 * Generates the two non-inline entry points for one high-bit-depth planar
 * YUV input format:
 *   <BE_LE><depth>ToUV_c -> yuv9_OR_10ToUV_c_template
 *   <BE_LE><depth>ToY_c  -> yuv9_or_10ToY_c_template
 * Each wrapper forwards its arguments together with the constant `origin`
 * and `depth`; the trailing `unused` parameter is ignored.
 *
 * depth  - bits per sample (also part of the generated name)
 * BE_LE  - byte-order part of the generated name (BE or LE)
 * origin - PIX_FMT_* constant passed to the templates
 */
1494 #define YUV_NBPS(depth, BE_LE, origin) \
1495 static void BE_LE ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1496                                      const uint8_t *srcU, const uint8_t *srcV, \
1497                                      int width, uint32_t *unused) \
1498 { \
1499     yuv9_OR_10ToUV_c_template(dstU, dstV, srcU, srcV, width, origin, depth); \
1500 } \
1501 static void BE_LE ## depth ## ToY_c(uint8_t *dstY, const uint8_t *srcY, \
1502                                     int width, uint32_t *unused) \
1503 { \
1504     yuv9_or_10ToY_c_template(dstY, srcY, width, origin, depth); \
1505 }
1506
/* Instantiations: LE9*, BE9*, LE10* and BE10* function pairs. */
1507 YUV_NBPS( 9, LE, PIX_FMT_YUV420P9LE);
1508 YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
1509 YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
1510 YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
1511
1512 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1513                        int width, uint32_t *unused)
1514 {
1515     int i;
1516     for (i=0; i<width; i++) {
1517         int b= src[i*3+0];
1518         int g= src[i*3+1];
1519         int r= src[i*3+2];
1520
1521         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1522     }
1523 }
1524
1525 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1526                         const uint8_t *src2, int width, uint32_t *unused)
1527 {
1528     int i;
1529     for (i=0; i<width; i++) {
1530         int b= src1[3*i + 0];
1531         int g= src1[3*i + 1];
1532         int r= src1[3*i + 2];
1533
1534         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1535         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1536     }
1537     assert(src1 == src2);
1538 }
1539
1540 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1541                              const uint8_t *src2, int width, uint32_t *unused)
1542 {
1543     int i;
1544     for (i=0; i<width; i++) {
1545         int b= src1[6*i + 0] + src1[6*i + 3];
1546         int g= src1[6*i + 1] + src1[6*i + 4];
1547         int r= src1[6*i + 2] + src1[6*i + 5];
1548
1549         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1550         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1551     }
1552     assert(src1 == src2);
1553 }
1554
1555 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1556                        uint32_t *unused)
1557 {
1558     int i;
1559     for (i=0; i<width; i++) {
1560         int r= src[i*3+0];
1561         int g= src[i*3+1];
1562         int b= src[i*3+2];
1563
1564         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1565     }
1566 }
1567
1568 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1569                         const uint8_t *src2, int width, uint32_t *unused)
1570 {
1571     int i;
1572     assert(src1==src2);
1573     for (i=0; i<width; i++) {
1574         int r= src1[3*i + 0];
1575         int g= src1[3*i + 1];
1576         int b= src1[3*i + 2];
1577
1578         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1579         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1580     }
1581 }
1582
1583 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1584                              const uint8_t *src2, int width, uint32_t *unused)
1585 {
1586     int i;
1587     assert(src1==src2);
1588     for (i=0; i<width; i++) {
1589         int r= src1[6*i + 0] + src1[6*i + 3];
1590         int g= src1[6*i + 1] + src1[6*i + 4];
1591         int b= src1[6*i + 2] + src1[6*i + 5];
1592
1593         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1594         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1595     }
1596 }
1597
1598
1599 // bilinear / bicubic scaling
/*
 * Generic horizontal FIR scaler.
 *
 * For every output sample x, filterSize input bytes starting at
 * src[filterPos[x]] are weighted with filter[filterSize*x + k] and summed.
 * The sum is shifted right by 7 and clamped from above to 32767.
 * srcW and xInc are not used by this implementation.
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     int srcW, int xInc,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const int16_t *coeffs = filter + filterSize*x;
        const int firstTap = filterPos[x];
        int acc = 0;
        int k;

        for (k = 0; k < filterSize; k++)
            acc += ((int)src[firstTap + k]) * coeffs[k];

        acc >>= 7;
        /* only the upper bound is clamped: the cubic equation does overflow */
        dst[x] = FFMIN(acc, (1<<15)-1);
    }
}
1618
//FIXME all pal and rgb srcFormats could do this conversion as well
1620 //FIXME all scalers more complex than bilinear could do half of this transform
/*
 * In-place range conversion of two chroma lines of width samples each.
 * Every sample is clamped to at most 30775 and then mapped through
 * (x*4663 - 9289992) >> 12.
 * (The original code annotated these lines with "-264"; the meaning of that
 * value is not evident from this function.)
 */
static void chrRangeToJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int clipped;

        clipped = FFMIN(dstU[x], 30775);
        dstU[x] = (clipped*4663 - 9289992) >> 12;

        clipped = FFMIN(dstV[x], 30775);
        dstV[x] = (clipped*4663 - 9289992) >> 12;
    }
}
/*
 * In-place range conversion of two chroma lines of width samples each.
 * Every sample is mapped through (x*1799 + 4081085) >> 11.
 * (The original code annotated these lines with "1469"; the meaning of that
 * value is not evident from this function.)
 */
static void chrRangeFromJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        int scaled;

        scaled  = dstU[x]*1799 + 4081085;
        dstU[x] = scaled >> 11;

        scaled  = dstV[x]*1799 + 4081085;
        dstV[x] = scaled >> 11;

        x++;
    }
}
/*
 * In-place range conversion of one luma line of width samples.
 * Every sample is clamped to at most 30189 and then mapped through
 * (x*19077 - 39057361) >> 14.
 */
static void lumRangeToJpeg_c(uint16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int clipped = FFMIN(dst[x], 30189);

        dst[x] = (clipped*19077 - 39057361) >> 14;
    }
}
/*
 * In-place range conversion of one luma line of width samples.
 * Every sample is mapped through (x*14071 + 33561947) >> 14.
 */
static void lumRangeFromJpeg_c(uint16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int scaled = dst[x]*14071 + 33561947;

        dst[x] = scaled >> 14;
        x++;
    }
}
1649
1650 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1651                            const uint8_t *src, int srcW, int xInc)
1652 {
1653     int i;
1654     unsigned int xpos=0;
1655     for (i=0;i<dstWidth;i++) {
1656         register unsigned int xx=xpos>>16;
1657         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1658         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1659         xpos+=xInc;
1660     }
1661 }
1662
1663 // *** horizontal scale Y line to temp buffer
1664 static av_always_inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1665                                      const uint8_t *src, int srcW, int xInc,
1666                                      const int16_t *hLumFilter,
1667                                      const int16_t *hLumFilterPos, int hLumFilterSize,
1668                                      uint8_t *formatConvBuffer,
1669                                      uint32_t *pal, int isAlpha)
1670 {
1671     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1672     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1673
1674     if (toYV12) {
1675         toYV12(formatConvBuffer, src, srcW, pal);
1676         src= formatConvBuffer;
1677     }
1678
1679     if (!c->hyscale_fast) {
1680         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1681     } else { // fast bilinear upscale / crap downscale
1682         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1683     }
1684
1685     if (convertRange)
1686         convertRange(dst, dstWidth);
1687 }
1688
1689 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1690                            int dstWidth, const uint8_t *src1,
1691                            const uint8_t *src2, int srcW, int xInc)
1692 {
1693     int i;
1694     unsigned int xpos=0;
1695     for (i=0;i<dstWidth;i++) {
1696         register unsigned int xx=xpos>>16;
1697         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1698         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1699         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1700         xpos+=xInc;
1701     }
1702 }
1703
1704 static av_always_inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1705                                      const uint8_t *src1, const uint8_t *src2,
1706                                      int srcW, int xInc, const int16_t *hChrFilter,
1707                                      const int16_t *hChrFilterPos, int hChrFilterSize,
1708                                      uint8_t *formatConvBuffer, uint32_t *pal)
1709 {
1710     if (c->chrToYV12) {
1711         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1712         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1713         src1= formatConvBuffer;
1714         src2= buf2;
1715     }
1716
1717     if (!c->hcscale_fast) {
1718         c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1719         c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1720     } else { // fast bilinear upscale / crap downscale
1721         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1722     }
1723
1724     if (c->chrConvertRange)
1725         c->chrConvertRange(dst1, dst2, dstWidth);
1726 }
1727
1728 static av_always_inline void
1729 find_c_packed_planar_out_funcs(SwsContext *c,
1730                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
1731                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
1732                                yuv2packedX_fn *yuv2packedX)
1733 {
1734     enum PixelFormat dstFormat = c->dstFormat;
1735
1736     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1737         *yuv2yuvX     = yuv2nv12X_c;
1738     } else if (is16BPS(dstFormat)) {
1739         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
1740     } else if (is9_OR_10BPS(dstFormat)) {
1741         if (dstFormat == PIX_FMT_YUV420P9BE || dstFormat == PIX_FMT_YUV420P9LE) {
1742             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
1743         } else {
1744             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
1745         }
1746     } else {
1747         *yuv2yuv1     = yuv2yuv1_c;
1748         *yuv2yuvX     = yuv2yuvX_c;
1749     }
1750     if(c->flags & SWS_FULL_CHR_H_INT) {
1751         *yuv2packedX = yuv2rgbX_c_full;
1752     } else {
1753         switch (dstFormat) {
1754         case PIX_FMT_GRAY16BE:
1755             *yuv2packed1 = yuv2gray16BE_1_c;
1756             *yuv2packed2 = yuv2gray16BE_2_c;
1757             *yuv2packedX = yuv2gray16BE_X_c;
1758             break;
1759         case PIX_FMT_GRAY16LE:
1760             *yuv2packed1 = yuv2gray16LE_1_c;
1761             *yuv2packed2 = yuv2gray16LE_2_c;
1762             *yuv2packedX = yuv2gray16LE_X_c;
1763             break;
1764         case PIX_FMT_MONOWHITE:
1765             *yuv2packed1 = yuv2monowhite_1_c;
1766             *yuv2packed2 = yuv2monowhite_2_c;
1767             *yuv2packedX = yuv2monowhite_X_c;
1768             break;
1769         case PIX_FMT_MONOBLACK:
1770             *yuv2packed1 = yuv2monoblack_1_c;
1771             *yuv2packed2 = yuv2monoblack_2_c;
1772             *yuv2packedX = yuv2monoblack_X_c;
1773             break;
1774         default:
1775             *yuv2packed1 = yuv2packed1_c;
1776             *yuv2packed2 = yuv2packed2_c;
1777             *yuv2packedX = yuv2packedX_c;
1778             break;
1779         }
1780     }
1781 }
1782
/* Set to 1 to enable the DEBUG_BUFFERS() trace output. */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Log at AV_LOG_DEBUG level when DEBUG_SWSCALE_BUFFERS is non-zero.
 * Requires a variable named c in scope at the call site (passed to av_log).
 * Wrapped in do { } while (0) so the macro expands to exactly one statement
 * and cannot capture a following else when used in an unbraced if.
 */
#define DEBUG_BUFFERS(...) do { if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
1785
/*
 * Scale one slice of source lines and write every destination line that can
 * be completed with the input seen so far.
 *
 * c          scaler context; filters, line buffers and output functions are
 *            read from it, and dstY / lumBufIndex / chrBufIndex /
 *            lastInLumBuf / lastInChrBuf are written back before returning
 * src        source plane pointers for this slice
 * srcStride  source strides
 * srcSliceY  index of the first source line contained in the slice
 * srcSliceH  number of source lines in the slice
 * dst        destination plane pointers
 * dstStride  destination strides
 *
 * Returns the number of destination lines produced by this call
 * (dstY on exit minus dstY on entry, after the srcSliceY == 0 reset).
 *
 * Side effects on the caller's arrays: for packed source formats src[1..3]
 * and srcStride[1..3] are overwritten with the values of plane 0, and
 * srcStride[1] / srcStride[2] are always shifted left by c->vChrDrop.
 */
static int swScale(SwsContext *c, const uint8_t* src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **chrVPixBuf= c->chrVPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* slice position and height in chroma lines; the double negation makes
       the height round up (assuming an arithmetic right shift of negative
       values) */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;
    /* local copies of the output functions; they may be replaced by the
       plain C versions for the last destination lines (see below) */
    yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed input: make all four plane pointers/strides refer to plane 0 */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    /* warn (once per process, only with SWS_PRINT_INFO) about destination
       strides that are not a multiple of 8 */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    /* remember where this call started so the line count can be returned */
    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        /* same, but for the last luma line sharing this chroma line
           (dstY with the subsampling bits set), clamped to dstH-1 */
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        /* not enough input: buffer everything that is left in this slice
           and leave the loop further down without producing output */
        if (!enough_lines) {
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                    hLumFilter, hLumFilterPos, hLumFilterSize,
                    formatConvBuffer,
                    pal, 0);
            /* alpha uses the luma filter and the same buffer index */
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
                        lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
                        formatConvBuffer,
                        pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            /* the buffer index still advances when chroma scaling is skipped */
            if (c->needs_hcscale)
                hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
                          chrDstW, src1, src2, chrSrcW, chrXInc,
                          hChrFilter, hChrFilterPos, hChrFilterSize,
                          formatConvBuffer, pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if HAVE_MMX
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
#endif
        /* for the last two destination lines switch the local output
           function pointers to the plain C implementations */
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
                                           &yuv2packed1, &yuv2packed2,
                                           &yuv2packedX);
        }

        {
            /* pointers to the first buffered line needed by the vertical
               filter. NOTE(review): the + v*BufSize offset presumably relies
               on the pointer arrays holding every line buffer twice (the
               asserts below bound them by 2*v*BufSize) - confirm against the
               allocation code */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                /* chroma is only written on lines where the subsampling bits
                   of dstY are zero, and never for gray output */
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrUBuf= chrUSrcPtr[0];
                    const int16_t *chrVBuf= chrVSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
                                uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
                                chrVSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
                assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
                                   *chrVSrcPtr, *(chrVSrcPtr+1),
                                   alpPixBuf ? *alpSrcPtr : NULL,
                                   dest, dstW, chrAlpha, dstFormat, flags, dstY);
                } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    /* the first coefficient of each filter is replicated into
                       both 16-bit halves (*0x10001) of entries 2 and 3 */
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
                                   *chrVSrcPtr, *(chrVSrcPtr+1),
                                   alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                   dest, dstW, lumAlpha, chrAlpha, dstY);
                } else { //general RGB
                    yuv2packedX(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* YUVA420P output without alpha line buffers: fill the alpha plane rows
       produced by this call with 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if HAVE_MMX2
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
        __asm__ volatile("sfence":::"memory");
#endif
    emms_c();

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2044
2045 static av_cold void sws_init_swScale_c(SwsContext *c)
2046 {
2047     enum PixelFormat srcFormat = c->srcFormat;
2048
2049     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2050                                    &c->yuv2packed1, &c->yuv2packed2,
2051                                    &c->yuv2packedX);
2052
2053     c->hScale       = hScale_c;
2054
2055     if (c->flags & SWS_FAST_BILINEAR) {
2056         c->hyscale_fast = hyscale_fast_c;
2057         c->hcscale_fast = hcscale_fast_c;
2058     }
2059
2060     c->chrToYV12 = NULL;
2061     switch(srcFormat) {
2062         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2063         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2064         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2065         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2066         case PIX_FMT_RGB8     :
2067         case PIX_FMT_BGR8     :
2068         case PIX_FMT_PAL8     :
2069         case PIX_FMT_BGR4_BYTE:
2070         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2071         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
2072         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
2073         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
2074         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
2075         case PIX_FMT_YUV420P16BE:
2076         case PIX_FMT_YUV422P16BE:
2077         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
2078         case PIX_FMT_YUV420P16LE:
2079         case PIX_FMT_YUV422P16LE:
2080         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
2081     }
2082     if (c->chrSrcHSubSample) {
2083         switch(srcFormat) {
2084         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2085         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2086         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2087         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2088         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2089         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2090         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2091         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2092         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2093         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2094         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2095         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2096         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2097         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2098         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2099         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2100         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2101         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2102         }
2103     } else {
2104         switch(srcFormat) {
2105         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2106         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2107         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2108         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2109         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2110         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2111         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2112         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2113         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2114         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2115         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2116         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2117         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2118         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2119         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2120         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2121         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2122         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2123         }
2124     }
2125
2126     c->lumToYV12 = NULL;
2127     c->alpToYV12 = NULL;
2128     switch (srcFormat) {
2129     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
2130     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
2131     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2132     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2133     case PIX_FMT_YUYV422  :
2134     case PIX_FMT_YUV420P16BE:
2135     case PIX_FMT_YUV422P16BE:
2136     case PIX_FMT_YUV444P16BE:
2137     case PIX_FMT_Y400A    :
2138     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
2139     case PIX_FMT_UYVY422  :
2140     case PIX_FMT_YUV420P16LE:
2141     case PIX_FMT_YUV422P16LE:
2142     case PIX_FMT_YUV444P16LE:
2143     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c;    break;
2144     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2145     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2146     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2147     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2148     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2149     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2150     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2151     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2152     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2153     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2154     case PIX_FMT_RGB8     :
2155     case PIX_FMT_BGR8     :
2156     case PIX_FMT_PAL8     :
2157     case PIX_FMT_BGR4_BYTE:
2158     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2159     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2160     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2161     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2162     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2163     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2164     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2165     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2166     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2167     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2168     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2169     }
2170     if (c->alpPixBuf) {
2171         switch (srcFormat) {
2172         case PIX_FMT_BGRA:
2173         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2174         case PIX_FMT_ABGR:
2175         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2176         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2177         }
2178     }
2179
2180     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2181         if (c->srcRange) {
2182             c->lumConvertRange = lumRangeFromJpeg_c;
2183             c->chrConvertRange = chrRangeFromJpeg_c;
2184         } else {
2185             c->lumConvertRange = lumRangeToJpeg_c;
2186             c->chrConvertRange = chrRangeToJpeg_c;
2187         }
2188     }
2189
2190     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2191           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2192         c->needs_hcscale = 1;
2193 }
2194
2195 SwsFunc ff_getSwsFunc(SwsContext *c)
2196 {
2197     sws_init_swScale_c(c);
2198
2199     if (HAVE_MMX)
2200         ff_sws_init_swScale_mmx(c);
2201     if (HAVE_ALTIVEC)
2202         ff_sws_init_swScale_altivec(c);
2203
2204     return swScale;
2205 }