git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 static const double rgb2yuv_table[8][9]={
  84     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  85     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  86     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  87     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  88     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  89     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  90     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  91     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
  92 };
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
 109 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
 110 {  1,   3,   1,   3,   1,   3,   1,   3, },
 111 {  2,   0,   2,   0,   2,   0,   2,   0, },
 112 };
 113
 114 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 115 {  6,   2,   6,   2,   6,   2,   6,   2, },
 116 {  0,   4,   0,   4,   0,   4,   0,   4, },
 117 };
 118
 119 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 120 {  8,   4,  11,   7,   8,   4,  11,   7, },
 121 {  2,  14,   1,  13,   2,  14,   1,  13, },
 122 { 10,   6,   9,   5,  10,   6,   9,   5, },
 123 {  0,  12,   3,  15,   0,  12,   3,  15, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 127 { 17,   9,  23,  15,  16,   8,  22,  14, },
 128 {  5,  29,   3,  27,   4,  28,   2,  26, },
 129 { 21,  13,  19,  11,  20,  12,  18,  10, },
 130 {  0,  24,   6,  30,   1,  25,   7,  31, },
 131 { 16,   8,  22,  14,  17,   9,  23,  15, },
 132 {  4,  28,   2,  26,   5,  29,   3,  27, },
 133 { 20,  12,  18,  10,  21,  13,  19,  11, },
 134 {  1,  25,   7,  31,   0,  24,   6,  30, },
 135 };
 136
 137 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 138 {  0,  55,  14,  68,   3,  58,  17,  72, },
 139 { 37,  18,  50,  32,  40,  22,  54,  35, },
 140 {  9,  64,   5,  59,  13,  67,   8,  63, },
 141 { 46,  27,  41,  23,  49,  31,  44,  26, },
 142 {  2,  57,  16,  71,   1,  56,  15,  70, },
 143 { 39,  21,  52,  34,  38,  19,  51,  33, },
 144 { 11,  66,   7,  62,  10,  65,   6,  60, },
 145 { 48,  30,  43,  25,  47,  29,  42,  24, },
 146 };
 147
 148 #if 1
 149 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 150 {117,  62, 158, 103, 113,  58, 155, 100, },
 151 { 34, 199,  21, 186,  31, 196,  17, 182, },
 152 {144,  89, 131,  76, 141,  86, 127,  72, },
 153 {  0, 165,  41, 206,  10, 175,  52, 217, },
 154 {110,  55, 151,  96, 120,  65, 162, 107, },
 155 { 28, 193,  14, 179,  38, 203,  24, 189, },
 156 {138,  83, 124,  69, 148,  93, 134,  79, },
 157 {  7, 172,  48, 213,   3, 168,  45, 210, },
 158 };
 159 #elif 1
 160 // tries to correct a gamma of 1.5
 161 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 162 {  0, 143,  18, 200,   2, 156,  25, 215, },
 163 { 78,  28, 125,  64,  89,  36, 138,  74, },
 164 { 10, 180,   3, 161,  16, 195,   8, 175, },
 165 {109,  51,  93,  38, 121,  60, 105,  47, },
 166 {  1, 152,  23, 210,   0, 147,  20, 205, },
 167 { 85,  33, 134,  71,  81,  30, 130,  67, },
 168 { 14, 190,   6, 171,  12, 185,   5, 166, },
 169 {117,  57, 101,  44, 113,  54,  97,  41, },
 170 };
 171 #elif 1
 172 // tries to correct a gamma of 2.0
 173 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 174 {  0, 124,   8, 193,   0, 140,  12, 213, },
 175 { 55,  14, 104,  42,  66,  19, 119,  52, },
 176 {  3, 168,   1, 145,   6, 187,   3, 162, },
 177 { 86,  31,  70,  21,  99,  39,  82,  28, },
 178 {  0, 134,  11, 206,   0, 129,   9, 200, },
 179 { 62,  17, 114,  48,  58,  16, 109,  45, },
 180 {  5, 181,   2, 157,   4, 175,   1, 151, },
 181 { 95,  36,  78,  26,  90,  34,  74,  24, },
 182 };
 183 #else
 184 // tries to correct a gamma of 2.5
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 186 {  0, 107,   3, 187,   0, 125,   6, 212, },
 187 { 39,   7,  86,  28,  49,  11, 102,  36, },
 188 {  1, 158,   0, 131,   3, 180,   1, 151, },
 189 { 68,  19,  52,  12,  81,  25,  64,  17, },
 190 {  0, 119,   5, 203,   0, 113,   4, 195, },
 191 { 45,   9,  96,  33,  42,   8,  91,  30, },
 192 {  2, 172,   1, 144,   2, 165,   0, 137, },
 193 { 77,  23,  60,  15,  72,  21,  56,  14, },
 194 };
 195 #endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 203                       uint16_t *aDest, int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     int shift = 11 + 16 - output_bits;
 209
 210 #define output_pixel(pos, val) \
 211     if (big_endian) { \
 212         if (output_bits == 16) { \
 213             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 214         } else { \
 215             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 216         } \
 217     } else { \
 218         if (output_bits == 16) { \
 219             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 220         } else { \
 221             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 222         } \
 223     }
 224     for (i = 0; i < dstW; i++) {
 225         int val = 1 << (26-output_bits);
 226         int j;
 227
 228         for (j = 0; j < lumFilterSize; j++)
 229             val += lumSrc[j][i] * lumFilter[j];
 230
 231         output_pixel(&dest[i], val);
 232     }
 233
 234     if (uDest) {
 235         for (i = 0; i < chrDstW; i++) {
 236             int u = 1 << (26-output_bits);
 237             int v = 1 << (26-output_bits);
 238             int j;
 239
 240             for (j = 0; j < chrFilterSize; j++) {
 241                 u += chrUSrc[j][i] * chrFilter[j];
 242                 v += chrVSrc[j][i] * chrFilter[j];
 243             }
 244
 245             output_pixel(&uDest[i], u);
 246             output_pixel(&vDest[i], v);
 247         }
 248     }
 249
 250     if (CONFIG_SWSCALE_ALPHA && aDest) {
 251         for (i = 0; i < dstW; i++) {
 252             int val = 1 << (26-output_bits);
 253             int j;
 254
 255             for (j = 0; j < lumFilterSize; j++)
 256                 val += alpSrc[j][i] * lumFilter[j];
 257
 258             output_pixel(&aDest[i], val);
 259         }
 260     }
 261 #undef output_pixel
 262 }
 263
 264 #define yuv2NBPS(bits, BE_LE, is_be) \
 265 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 266                               const int16_t **lumSrc, int lumFilterSize, \
 267                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 268                               const int16_t **chrVSrc, \
 269                               int chrFilterSize, const int16_t **alpSrc, \
 270                               uint8_t *_dest, uint8_t *_uDest, uint8_t *_vDest, \
 271                               uint8_t *_aDest, int dstW, int chrDstW) \
 272 { \
 273     uint16_t *dest  = (uint16_t *) _dest,  *uDest = (uint16_t *) _uDest, \
 274              *vDest = (uint16_t *) _vDest, *aDest = (uint16_t *) _aDest; \
 275     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 276                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 277                           alpSrc, \
 278                           dest, uDest, vDest, aDest, \
 279                           dstW, chrDstW, is_be, bits); \
 280 }
 281 yuv2NBPS( 9, BE, 1);
 282 yuv2NBPS( 9, LE, 0);
 283 yuv2NBPS(10, BE, 1);
 284 yuv2NBPS(10, LE, 0);
 285 yuv2NBPS(16, BE, 1);
 286 yuv2NBPS(16, LE, 0);
 287
 288 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 289                        const int16_t **lumSrc, int lumFilterSize,
 290                        const int16_t *chrFilter, const int16_t **chrUSrc,
 291                        const int16_t **chrVSrc,
 292                        int chrFilterSize, const int16_t **alpSrc,
 293                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 294                        uint8_t *aDest, int dstW, int chrDstW)
 295 {
 296     //FIXME Optimize (just quickly written not optimized..)
 297     int i;
 298     for (i=0; i<dstW; i++) {
 299         int val=1<<18;
 300         int j;
 301         for (j=0; j<lumFilterSize; j++)
 302             val += lumSrc[j][i] * lumFilter[j];
 303
 304         dest[i]= av_clip_uint8(val>>19);
 305     }
 306
 307     if (uDest)
 308         for (i=0; i<chrDstW; i++) {
 309             int u=1<<18;
 310             int v=1<<18;
 311             int j;
 312             for (j=0; j<chrFilterSize; j++) {
 313                 u += chrUSrc[j][i] * chrFilter[j];
 314                 v += chrVSrc[j][i] * chrFilter[j];
 315             }
 316
 317             uDest[i]= av_clip_uint8(u>>19);
 318             vDest[i]= av_clip_uint8(v>>19);
 319         }
 320
 321     if (CONFIG_SWSCALE_ALPHA && aDest)
 322         for (i=0; i<dstW; i++) {
 323             int val=1<<18;
 324             int j;
 325             for (j=0; j<lumFilterSize; j++)
 326                 val += alpSrc[j][i] * lumFilter[j];
 327
 328             aDest[i]= av_clip_uint8(val>>19);
 329         }
 330 }
 331
 332 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 333                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 334                        const int16_t *alpSrc,
 335                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 336                        uint8_t *aDest, int dstW, int chrDstW)
 337 {
 338     int i;
 339     for (i=0; i<dstW; i++) {
 340         int val= (lumSrc[i]+64)>>7;
 341         dest[i]= av_clip_uint8(val);
 342     }
 343
 344     if (uDest)
 345         for (i=0; i<chrDstW; i++) {
 346             int u=(chrUSrc[i]+64)>>7;
 347             int v=(chrVSrc[i]+64)>>7;
 348             uDest[i]= av_clip_uint8(u);
 349             vDest[i]= av_clip_uint8(v);
 350         }
 351
 352     if (CONFIG_SWSCALE_ALPHA && aDest)
 353         for (i=0; i<dstW; i++) {
 354             int val= (alpSrc[i]+64)>>7;
 355             aDest[i]= av_clip_uint8(val);
 356         }
 357 }
 358
 359 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 360                         const int16_t **lumSrc, int lumFilterSize,
 361                         const int16_t *chrFilter, const int16_t **chrUSrc,
 362                         const int16_t **chrVSrc, int chrFilterSize,
 363                         const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest,
 364                         uint8_t *vDest, uint8_t *aDest,
 365                         int dstW, int chrDstW)
 366 {
 367     enum PixelFormat dstFormat = c->dstFormat;
 368
 369     //FIXME Optimize (just quickly written not optimized..)
 370     int i;
 371     for (i=0; i<dstW; i++) {
 372         int val=1<<18;
 373         int j;
 374         for (j=0; j<lumFilterSize; j++)
 375             val += lumSrc[j][i] * lumFilter[j];
 376
 377         dest[i]= av_clip_uint8(val>>19);
 378     }
 379
 380     if (!uDest)
 381         return;
 382
 383     if (dstFormat == PIX_FMT_NV12)
 384         for (i=0; i<chrDstW; i++) {
 385             int u=1<<18;
 386             int v=1<<18;
 387             int j;
 388             for (j=0; j<chrFilterSize; j++) {
 389                 u += chrUSrc[j][i] * chrFilter[j];
 390                 v += chrVSrc[j][i] * chrFilter[j];
 391             }
 392
 393             uDest[2*i]= av_clip_uint8(u>>19);
 394             uDest[2*i+1]= av_clip_uint8(v>>19);
 395         }
 396     else
 397         for (i=0; i<chrDstW; i++) {
 398             int u=1<<18;
 399             int v=1<<18;
 400             int j;
 401             for (j=0; j<chrFilterSize; j++) {
 402                 u += chrUSrc[j][i] * chrFilter[j];
 403                 v += chrVSrc[j][i] * chrFilter[j];
 404             }
 405
 406             uDest[2*i]= av_clip_uint8(v>>19);
 407             uDest[2*i+1]= av_clip_uint8(u>>19);
 408         }
 409 }
 410
 411 static av_always_inline void
 412 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 413                         const int16_t **lumSrc, int lumFilterSize,
 414                         const int16_t *chrFilter, const int16_t **chrUSrc,
 415                         const int16_t **chrVSrc, int chrFilterSize,
 416                         const int16_t **alpSrc, uint8_t *dest, int dstW,
 417                         int y, enum PixelFormat target)
 418 {
 419     int i;
 420
 421 #define output_pixel(pos, val) \
 422         if (target == PIX_FMT_GRAY16BE) { \
 423             AV_WB16(pos, val); \
 424         } else { \
 425             AV_WL16(pos, val); \
 426         }
 427     for (i = 0; i < (dstW >> 1); i++) {
 428         int j;
 429         int Y1 = 1 << 18;
 430         int Y2 = 1 << 18;
 431         const int i2 = 2 * i;
 432
 433         for (j = 0; j < lumFilterSize; j++) {
 434             Y1 += lumSrc[j][i2]   * lumFilter[j];
 435             Y2 += lumSrc[j][i2+1] * lumFilter[j];
 436         }
 437         Y1 >>= 11;
 438         Y2 >>= 11;
 439         if ((Y1 | Y2) & 0x10000) {
 440             Y1 = av_clip_uint16(Y1);
 441             Y2 = av_clip_uint16(Y2);
 442         }
 443         output_pixel(&dest[2 * i2 + 0], Y1);
 444         output_pixel(&dest[2 * i2 + 2], Y2);
 445     }
 446 }
 447
 448 static av_always_inline void
 449 yuv2gray16_2_c_template(SwsContext *c, const uint16_t *buf0,
 450                         const uint16_t *buf1, const uint16_t *ubuf0,
 451                         const uint16_t *ubuf1, const uint16_t *vbuf0,
 452                         const uint16_t *vbuf1, const uint16_t *abuf0,
 453                         const uint16_t *abuf1, uint8_t *dest, int dstW,
 454                         int yalpha, int uvalpha, int y,
 455                         enum PixelFormat target)
 456 {
 457     int  yalpha1 = 4095 - yalpha; \
 458     int i;
 459
 460     for (i = 0; i < (dstW >> 1); i++) {
 461         const int i2 = 2 * i;
 462         int Y1 = (buf0[i2  ] * yalpha1 + buf1[i2  ] * yalpha) >> 11;
 463         int Y2 = (buf0[i2+1] * yalpha1 + buf1[i2+1] * yalpha) >> 11;
 464
 465         output_pixel(&dest[2 * i2 + 0], Y1);
 466         output_pixel(&dest[2 * i2 + 2], Y2);
 467     }
 468 }
 469
 470 static av_always_inline void
 471 yuv2gray16_1_c_template(SwsContext *c, const uint16_t *buf0,
 472                         const uint16_t *ubuf0, const uint16_t *ubuf1,
 473                         const uint16_t *vbuf0, const uint16_t *vbuf1,
 474                         const uint16_t *abuf0, uint8_t *dest, int dstW,
 475                         int uvalpha, enum PixelFormat dstFormat,
 476                         int flags, int y, enum PixelFormat target)
 477 {
 478     int i;
 479
 480     for (i = 0; i < (dstW >> 1); i++) {
 481         const int i2 = 2 * i;
 482         int Y1 = buf0[i2  ] << 1;
 483         int Y2 = buf0[i2+1] << 1;
 484
 485         output_pixel(&dest[2 * i2 + 0], Y1);
 486         output_pixel(&dest[2 * i2 + 2], Y2);
 487     }
 488 #undef output_pixel
 489 }
 490
 491 #define YUV2PACKEDWRAPPER(name, ext, fmt) \
 492 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 493                         const int16_t **lumSrc, int lumFilterSize, \
 494                         const int16_t *chrFilter, const int16_t **chrUSrc, \
 495                         const int16_t **chrVSrc, int chrFilterSize, \
 496                         const int16_t **alpSrc, uint8_t *dest, int dstW, \
 497                         int y) \
 498 { \
 499     name ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 500                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 501                           alpSrc, dest, dstW, y, fmt); \
 502 } \
 503  \
 504 static void name ## ext ## _2_c(SwsContext *c, const uint16_t *buf0, \
 505                         const uint16_t *buf1, const uint16_t *ubuf0, \
 506                         const uint16_t *ubuf1, const uint16_t *vbuf0, \
 507                         const uint16_t *vbuf1, const uint16_t *abuf0, \
 508                         const uint16_t *abuf1, uint8_t *dest, int dstW, \
 509                         int yalpha, int uvalpha, int y) \
 510 { \
 511     name ## _2_c_template(c, buf0, buf1, ubuf0, ubuf1, \
 512                           vbuf0, vbuf1, abuf0, abuf1, \
 513                           dest, dstW, yalpha, uvalpha, y, fmt); \
 514 } \
 515  \
 516 static void name ## ext ## _1_c(SwsContext *c, const uint16_t *buf0, \
 517                         const uint16_t *ubuf0, const uint16_t *ubuf1, \
 518                         const uint16_t *vbuf0, const uint16_t *vbuf1, \
 519                         const uint16_t *abuf0, uint8_t *dest, int dstW, \
 520                         int uvalpha, enum PixelFormat dstFormat, \
 521                         int flags, int y) \
 522 { \
 523     name ## _1_c_template(c, buf0, ubuf0, ubuf1, vbuf0, \
 524                           vbuf1, abuf0, dest, dstW, uvalpha, \
 525                           dstFormat, flags, y, fmt); \
 526 }
 527
 528 YUV2PACKEDWRAPPER(yuv2gray16, LE, PIX_FMT_GRAY16LE);
 529 YUV2PACKEDWRAPPER(yuv2gray16, BE, PIX_FMT_GRAY16BE);
 530
 531 static av_always_inline void
 532 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 533                       const int16_t **lumSrc, int lumFilterSize,
 534                       const int16_t *chrFilter, const int16_t **chrUSrc,
 535                       const int16_t **chrVSrc, int chrFilterSize,
 536                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 537                       int y, enum PixelFormat target)
 538 {
 539     const uint8_t * const d128=dither_8x8_220[y&7];
 540     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 541     int i;
 542     int acc = 0;
 543
 544 #define output_pixel(pos, acc) \
 545     if (target == PIX_FMT_MONOBLACK) { \
 546         pos = acc; \
 547     } else { \
 548         pos = ~acc; \
 549     }
 550     for (i = 0; i < dstW - 1; i += 2) {
 551         int j;
 552         int Y1 = 1 << 18;
 553         int Y2 = 1 << 18;
 554
 555         for (j = 0; j < lumFilterSize; j++) {
 556             Y1 += lumSrc[j][i]   * lumFilter[j];
 557             Y2 += lumSrc[j][i+1] * lumFilter[j];
 558         }
 559         Y1 >>= 19;
 560         Y2 >>= 19;
 561         if ((Y1 | Y2) & 0x100) {
 562             Y1 = av_clip_uint8(Y1);
 563             Y2 = av_clip_uint8(Y2);
 564         }
 565         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 566         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 567         if ((i & 7) == 6) {
 568             output_pixel(*dest++, acc);
 569         }
 570     }
 571 }
 572
 573 static av_always_inline void
 574 yuv2mono_2_c_template(SwsContext *c, const uint16_t *buf0,
 575                       const uint16_t *buf1, const uint16_t *ubuf0,
 576                       const uint16_t *ubuf1, const uint16_t *vbuf0,
 577                       const uint16_t *vbuf1, const uint16_t *abuf0,
 578                       const uint16_t *abuf1, uint8_t *dest, int dstW,
 579                       int yalpha, int uvalpha, int y,
 580                       enum PixelFormat target)
 581 {
 582     const uint8_t * const d128 = dither_8x8_220[y & 7];
 583     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 584     int  yalpha1 = 4095 - yalpha;
 585     int i;
 586
 587     for (i = 0; i < dstW - 7; i += 8) {
 588         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 589         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 590         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 591         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 592         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 593         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 594         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 595         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 596         output_pixel(*dest++, acc);
 597     }
 598 }
 599
 600 static av_always_inline void
 601 yuv2mono_1_c_template(SwsContext *c, const uint16_t *buf0,
 602                       const uint16_t *ubuf0, const uint16_t *ubuf1,
 603                       const uint16_t *vbuf0, const uint16_t *vbuf1,
 604                       const uint16_t *abuf0, uint8_t *dest, int dstW,
 605                       int uvalpha, enum PixelFormat dstFormat,
 606                       int flags, int y, enum PixelFormat target)
 607 {
 608     const uint8_t * const d128 = dither_8x8_220[y & 7];
 609     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 610     int i;
 611
 612     for (i = 0; i < dstW - 7; i += 8) {
 613         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 614         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 615         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 616         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 617         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 618         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 619         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 620         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 621         output_pixel(*dest++, acc);
 622     }
 623 #undef output_pixel
 624 }
 625
 626 YUV2PACKEDWRAPPER(yuv2mono, white, PIX_FMT_MONOWHITE);
 627 YUV2PACKEDWRAPPER(yuv2mono, black, PIX_FMT_MONOBLACK);
 628
 629 static av_always_inline void
 630 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 631                      const int16_t **lumSrc, int lumFilterSize,
 632                      const int16_t *chrFilter, const int16_t **chrUSrc,
 633                      const int16_t **chrVSrc, int chrFilterSize,
 634                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 635                      int y, enum PixelFormat target)
 636 {
 637     int i;
 638
 639 #define output_pixels(pos, Y1, U, Y2, V) \
 640     if (target == PIX_FMT_YUYV422) { \
 641         dest[pos + 0] = Y1; \
 642         dest[pos + 1] = U;  \
 643         dest[pos + 2] = Y2; \
 644         dest[pos + 3] = V;  \
 645     } else { \
 646         dest[pos + 0] = U;  \
 647         dest[pos + 1] = Y1; \
 648         dest[pos + 2] = V;  \
 649         dest[pos + 3] = Y2; \
 650     }
 651
 652     for (i = 0; i < (dstW >> 1); i++) {
 653         int j;
 654         int Y1 = 1 << 18;
 655         int Y2 = 1 << 18;
 656         int U  = 1 << 18;
 657         int V  = 1 << 18;
 658
 659         for (j = 0; j < lumFilterSize; j++) {
 660             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 661             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 662         }
 663         for (j = 0; j < chrFilterSize; j++) {
 664             U += chrUSrc[j][i] * chrFilter[j];
 665             V += chrVSrc[j][i] * chrFilter[j];
 666         }
 667         Y1 >>= 19;
 668         Y2 >>= 19;
 669         U  >>= 19;
 670         V  >>= 19;
 671         if ((Y1 | Y2 | U | V) & 0x100) {
 672             Y1 = av_clip_uint8(Y1);
 673             Y2 = av_clip_uint8(Y2);
 674             U  = av_clip_uint8(U);
 675             V  = av_clip_uint8(V);
 676         }
 677         output_pixels(4*i, Y1, U, Y2, V);
 678     }
 679 }
 680
 681 static av_always_inline void
 682 yuv2422_2_c_template(SwsContext *c, const uint16_t *buf0,
 683                      const uint16_t *buf1, const uint16_t *ubuf0,
 684                      const uint16_t *ubuf1, const uint16_t *vbuf0,
 685                      const uint16_t *vbuf1, const uint16_t *abuf0,
 686                      const uint16_t *abuf1, uint8_t *dest, int dstW,
 687                      int yalpha, int uvalpha, int y,
 688                      enum PixelFormat target)
 689 {
 690     int  yalpha1 = 4095 - yalpha;
 691     int uvalpha1 = 4095 - uvalpha;
 692     int i;
 693
 694     for (i = 0; i < (dstW >> 1); i++) {
 695         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 696         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 697         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 698         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 699
 700         output_pixels(i * 4, Y1, U, Y2, V);
 701     }
 702 }
 703
 704 static av_always_inline void
 705 yuv2422_1_c_template(SwsContext *c, const uint16_t *buf0,
 706                      const uint16_t *ubuf0, const uint16_t *ubuf1,
 707                      const uint16_t *vbuf0, const uint16_t *vbuf1,
 708                      const uint16_t *abuf0, uint8_t *dest, int dstW,
 709                      int uvalpha, enum PixelFormat dstFormat,
 710                      int flags, int y, enum PixelFormat target)
 711 {
 712     int i;
 713
 714     if (uvalpha < 2048) {
 715         for (i = 0; i < (dstW >> 1); i++) {
 716             int Y1 = buf0[i * 2]     >> 7;
 717             int Y2 = buf0[i * 2 + 1] >> 7;
 718             int U  = ubuf1[i]        >> 7;
 719             int V  = vbuf1[i]        >> 7;
 720
 721             output_pixels(i * 4, Y1, U, Y2, V);
 722         }
 723     } else {
 724         for (i = 0; i < (dstW >> 1); i++) {
 725             int Y1 =  buf0[i * 2]          >> 7;
 726             int Y2 =  buf0[i * 2 + 1]      >> 7;
 727             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 728             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 729
 730             output_pixels(i * 4, Y1, U, Y2, V);
 731         }
 732     }
 733 #undef output_pixels
 734 }
 735
 736 YUV2PACKEDWRAPPER(yuv2422, yuyv, PIX_FMT_YUYV422);
 737 YUV2PACKEDWRAPPER(yuv2422, uyvy, PIX_FMT_UYVY422);
 738
 739 #define YSCALE_YUV_2_RGBX_C(type,alpha) \
 740     for (i=0; i<(dstW>>1); i++) {\
 741         int j;\
 742         int Y1 = 1<<18;\
 743         int Y2 = 1<<18;\
 744         int U  = 1<<18;\
 745         int V  = 1<<18;\
 746         int av_unused A1, A2;\
 747         type av_unused *r, *b, *g;\
 748         const int i2= 2*i;\
 749         \
 750         for (j=0; j<lumFilterSize; j++) {\
 751             Y1 += lumSrc[j][i2] * lumFilter[j];\
 752             Y2 += lumSrc[j][i2+1] * lumFilter[j];\
 753         }\
 754         for (j=0; j<chrFilterSize; j++) {\
 755             U += chrUSrc[j][i] * chrFilter[j];\
 756             V += chrVSrc[j][i] * chrFilter[j];\
 757         }\
 758         Y1>>=19;\
 759         Y2>>=19;\
 760         U >>=19;\
 761         V >>=19;\
 762         if ((Y1|Y2|U|V)&0x100) {\
 763             Y1 = av_clip_uint8(Y1); \
 764             Y2 = av_clip_uint8(Y2); \
 765             U  = av_clip_uint8(U); \
 766             V  = av_clip_uint8(V); \
 767         }\
 768         if (alpha) {\
 769             A1 = 1<<18;\
 770             A2 = 1<<18;\
 771             for (j=0; j<lumFilterSize; j++) {\
 772                 A1 += alpSrc[j][i2  ] * lumFilter[j];\
 773                 A2 += alpSrc[j][i2+1] * lumFilter[j];\
 774             }\
 775             A1>>=19;\
 776             A2>>=19;\
 777             if ((A1|A2)&0x100) {\
 778                 A1 = av_clip_uint8(A1); \
 779                 A2 = av_clip_uint8(A2); \
 780             }\
 781         }\
 782         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
 783     r = (type *)c->table_rV[V];   \
 784     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
 785     b = (type *)c->table_bU[U];
 786
 /*
  * YSCALE_YUV_2_RGBX_FULL_C(rnd, alpha): loop prologue for vertically filtered
  * YUV -> RGB conversion using the context's matrix coefficients, one chroma
  * sample per output pixel.
  *
  * Expands to the opening of "for (i = 0; i < dstW; i++) {" and leaves the
  * loop body OPEN: the code following the macro use stores R, G, B (and A)
  * and supplies the closing brace.
  *
  * Names expected in scope: i, dstW, c, lumSrc, lumFilter, lumFilterSize,
  * chrUSrc, chrVSrc, chrFilter, chrFilterSize and, when `alpha` is non-zero,
  * alpSrc.
  *
  * U and V start at -(128<<19), which subtracts 128 (scaled by 2^19) from the
  * filtered chroma sums. The bias is written as -(128<<19) rather than
  * -128<<19 because left-shifting a negative value is undefined behaviour in
  * C; the value is the same.
  *
  * Y, U and V are reduced by >>10, Y has c->yuv2rgb_y_offset removed and is
  * scaled by c->yuv2rgb_y_coeff, then `rnd` is added before the chroma terms.
  * R, G and B are clipped to 30 bits with av_clip_uintp2() when either of the
  * two top bits is set in their bitwise OR; callers shift them down further.
  *
  * NOTE(review): the alpha accumulator is seeded with `rnd` but shifted by 19;
  * confirm that this is the intended rounding term for the callers' rnd value.
  */
#define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
    for (i=0; i<dstW; i++) {\
        int j;\
        int Y = 0;\
        int U = -(128<<19);\
        int V = -(128<<19);\
        int av_unused A;\
        int R,G,B;\
        \
        for (j=0; j<lumFilterSize; j++) {\
            Y += lumSrc[j][i     ] * lumFilter[j];\
        }\
        for (j=0; j<chrFilterSize; j++) {\
            U += chrUSrc[j][i] * chrFilter[j];\
            V += chrVSrc[j][i] * chrFilter[j];\
        }\
        Y >>=10;\
        U >>=10;\
        V >>=10;\
        if (alpha) {\
            A = rnd;\
            for (j=0; j<lumFilterSize; j++)\
                A += alpSrc[j][i     ] * lumFilter[j];\
            A >>=19;\
            if (A&0x100)\
                A = av_clip_uint8(A);\
        }\
        Y-= c->yuv2rgb_y_offset;\
        Y*= c->yuv2rgb_y_coeff;\
        Y+= rnd;\
        R= Y + V*c->yuv2rgb_v2r_coeff;\
        G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
        B= Y +                          U*c->yuv2rgb_u2b_coeff;\
        if ((R|G|B)&(0xC0000000)) {\
            R = av_clip_uintp2(R, 30); \
            G = av_clip_uintp2(G, 30); \
            B = av_clip_uintp2(B, 30); \
        }
 825
 /*
  * YSCALE_YUV_2_RGB2_C(type, alpha): loop prologue that blends two input
  * lines and looks the result up in the context's RGB tables.
  *
  * Opens "for (i = 0; i < dstW/2; i++) {" and leaves the body OPEN for the
  * caller to store pixels and close the brace. Per iteration it computes
  * Y1/Y2 (positions 2*i, 2*i+1) and one U/V pair as
  *     (line0 * alpha1 + line1 * alpha) >> 19
  * using buf0/buf1 with yalpha1/yalpha and ubuf*, vbuf* with uvalpha1/uvalpha.
  * When `alpha` is non-zero, A1/A2 are blended the same way from abuf0/abuf1.
  * No clipping is applied here.
  *
  * Names expected in scope: i, dstW, c, buf0, buf1, ubuf0, ubuf1, vbuf0,
  * vbuf1, yalpha, yalpha1, uvalpha, uvalpha1 and, if alpha, abuf0, abuf1.
  * r, g, b are set from c->table_rV[V], c->table_gU[U] + c->table_gV[V]
  * and c->table_bU[U].
  */
 826 #define YSCALE_YUV_2_RGB2_C(type,alpha) \
 827     for (i=0; i<(dstW>>1); i++) { \
 828         const int i2= 2*i;       \
 829         int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>19;           \
 830         int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>19;           \
 831         int U= (ubuf0[i]*uvalpha1+ubuf1[i]*uvalpha)>>19;              \
 832         int V= (vbuf0[i]*uvalpha1+vbuf1[i]*uvalpha)>>19;              \
 833         type av_unused *r, *b, *g;                                    \
 834         int av_unused A1, A2;                                         \
 835         if (alpha) {\
 836             A1= (abuf0[i2  ]*yalpha1+abuf1[i2  ]*yalpha)>>19;         \
 837             A2= (abuf0[i2+1]*yalpha1+abuf1[i2+1]*yalpha)>>19;         \
 838         }\
 839     r = (type *)c->table_rV[V];\
 840     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 841     b = (type *)c->table_bU[U];
 842
 /*
  * YSCALE_YUV_2_RGB1_C(type, alpha): loop prologue for the single-line case
  * (no vertical blending).
  *
  * Opens "for (i = 0; i < dstW/2; i++) {" and leaves the body OPEN for the
  * caller. Y1/Y2 are buf0[2*i] and buf0[2*i+1] shifted right by 7; U and V
  * are ubuf1[i] and vbuf1[i] shifted right by 7. When `alpha` is non-zero,
  * A1/A2 come from abuf0 in the same way. r, g, b are set from the context
  * tables as in the other prologue macros.
  *
  * NOTE(review): chroma is read from ubuf1/vbuf1 although luma comes from
  * buf0 -- confirm against the caller which chroma line is meant here.
  */
 843 #define YSCALE_YUV_2_RGB1_C(type,alpha) \
 844     for (i=0; i<(dstW>>1); i++) {\
 845         const int i2= 2*i;\
 846         int Y1= buf0[i2  ]>>7;\
 847         int Y2= buf0[i2+1]>>7;\
 848         int U= (ubuf1[i])>>7;\
 849         int V= (vbuf1[i])>>7;\
 850         type av_unused *r, *b, *g;\
 851         int av_unused A1, A2;\
 852         if (alpha) {\
 853             A1= abuf0[i2  ]>>7;\
 854             A2= abuf0[i2+1]>>7;\
 855         }\
 856     r = (type *)c->table_rV[V];\
 857     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 858     b = (type *)c->table_bU[U];
 859
 /*
  * YSCALE_YUV_2_RGB1B_C(type, alpha): like YSCALE_YUV_2_RGB1_C, but chroma is
  * the average of two lines: U = (ubuf0[i] + ubuf1[i]) >> 8 and likewise for
  * V. Luma (buf0 >> 7) and alpha (abuf0 >> 7) are taken from a single line.
  *
  * Opens "for (i = 0; i < dstW/2; i++) {" and leaves the body OPEN; the code
  * after the macro use stores the pixels and closes the brace.
  */
 860 #define YSCALE_YUV_2_RGB1B_C(type,alpha) \
 861     for (i=0; i<(dstW>>1); i++) {\
 862         const int i2= 2*i;\
 863         int Y1= buf0[i2  ]>>7;\
 864         int Y2= buf0[i2+1]>>7;\
 865         int U= (ubuf0[i] + ubuf1[i])>>8;\
 866         int V= (vbuf0[i] + vbuf1[i])>>8;\
 867         type av_unused *r, *b, *g;\
 868         int av_unused A1, A2;\
 869         if (alpha) {\
 870             A1= abuf0[i2  ]>>7;\
 871             A2= abuf0[i2+1]>>7;\
 872         }\
 873     r = (type *)c->table_rV[V];\
 874     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
 875     b = (type *)c->table_bU[U];
 876
 /*
  * YSCALE_YUV_2_ANYRGB_C(func): switch on c->dstFormat and emit the packed
  * RGB store loop for that format.
  *
  * `func` is one of the loop-prologue macros above; it is expanded as
  * func(type, alpha), opens a for loop and provides Y1, Y2, A1, A2, i2 and the
  * table pointers r, g, b. This macro adds the stores and the closing brace.
  *
  * Per format group:
  *  - RGB48 / BGR48: each table value is written to both bytes of the
  *    component; 12 bytes per pixel pair.
  *  - RGBA/BGRA and ARGB/ABGR: r[Y] + g[Y] + b[Y] is stored as one 32-bit
  *    word; alpha is added as A<<24 (first group) or A (second group) only
  *    when CONFIG_SWSCALE_ALPHA && c->alpPixBuf. With CONFIG_SMALL the loop
  *    is expanded once and alpha is selected at run time; otherwise it is
  *    expanded separately for the alpha and no-alpha case.
  *  - RGB24 / BGR24: three byte stores per pixel, 6 bytes per pair.
  *  - 565 / 555 / 444: a dither offset picked from dither_2x2_* or
  *    dither_4x4_16 by the output row y is added to the table index.
  *  - RGB8/BGR8, RGB4/BGR4, RGB4_BYTE/BGR4_BYTE: per-column dither from the
  *    dither_8x8_* rows selected by y&7. For RGB4/BGR4 both pixels of the
  *    pair are packed into one byte, the second shifted left by 4.
  *
  * There is no default case: any other dstFormat writes nothing.
  * Names expected in scope: c, dest, y, i, plus whatever `func` needs.
  */
 877 #define YSCALE_YUV_2_ANYRGB_C(func)\
 878     switch(c->dstFormat) {\
 879     case PIX_FMT_RGB48BE:\
 880     case PIX_FMT_RGB48LE:\
 881         func(uint8_t,0)\
 882             ((uint8_t*)dest)[ 0]= r[Y1];\
 883             ((uint8_t*)dest)[ 1]= r[Y1];\
 884             ((uint8_t*)dest)[ 2]= g[Y1];\
 885             ((uint8_t*)dest)[ 3]= g[Y1];\
 886             ((uint8_t*)dest)[ 4]= b[Y1];\
 887             ((uint8_t*)dest)[ 5]= b[Y1];\
 888             ((uint8_t*)dest)[ 6]= r[Y2];\
 889             ((uint8_t*)dest)[ 7]= r[Y2];\
 890             ((uint8_t*)dest)[ 8]= g[Y2];\
 891             ((uint8_t*)dest)[ 9]= g[Y2];\
 892             ((uint8_t*)dest)[10]= b[Y2];\
 893             ((uint8_t*)dest)[11]= b[Y2];\
 894             dest+=12;\
 895         }\
 896         break;\
 897     case PIX_FMT_BGR48BE:\
 898     case PIX_FMT_BGR48LE:\
 899         func(uint8_t,0)\
 900             ((uint8_t*)dest)[ 0] = ((uint8_t*)dest)[ 1] = b[Y1];\
 901             ((uint8_t*)dest)[ 2] = ((uint8_t*)dest)[ 3] = g[Y1];\
 902             ((uint8_t*)dest)[ 4] = ((uint8_t*)dest)[ 5] = r[Y1];\
 903             ((uint8_t*)dest)[ 6] = ((uint8_t*)dest)[ 7] = b[Y2];\
 904             ((uint8_t*)dest)[ 8] = ((uint8_t*)dest)[ 9] = g[Y2];\
 905             ((uint8_t*)dest)[10] = ((uint8_t*)dest)[11] = r[Y2];\
 906             dest+=12;\
 907         }\
 908         break;\
 909     case PIX_FMT_RGBA:\
 910     case PIX_FMT_BGRA:\
 911         if (CONFIG_SMALL) {\
 912             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 913             func(uint32_t,needAlpha)\
 914                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? (A1<<24) : 0);\
 915                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? (A2<<24) : 0);\
 916             }\
 917         } else {\
 918             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 919                 func(uint32_t,1)\
 920                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (A1<<24);\
 921                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (A2<<24);\
 922                 }\
 923             } else {\
 924                 func(uint32_t,0)\
 925                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 926                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 927                 }\
 928             }\
 929         }\
 930         break;\
 931     case PIX_FMT_ARGB:\
 932     case PIX_FMT_ABGR:\
 933         if (CONFIG_SMALL) {\
 934             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;\
 935             func(uint32_t,needAlpha)\
 936                 ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + (needAlpha ? A1 : 0);\
 937                 ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + (needAlpha ? A2 : 0);\
 938             }\
 939         } else {\
 940             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {\
 941                 func(uint32_t,1)\
 942                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1] + A1;\
 943                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2] + A2;\
 944                 }\
 945             } else {\
 946                 func(uint32_t,0)\
 947                     ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
 948                     ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
 949                 }\
 950             }\
 951         }                \
 952         break;\
 953     case PIX_FMT_RGB24:\
 954         func(uint8_t,0)\
 955             ((uint8_t*)dest)[0]= r[Y1];\
 956             ((uint8_t*)dest)[1]= g[Y1];\
 957             ((uint8_t*)dest)[2]= b[Y1];\
 958             ((uint8_t*)dest)[3]= r[Y2];\
 959             ((uint8_t*)dest)[4]= g[Y2];\
 960             ((uint8_t*)dest)[5]= b[Y2];\
 961             dest+=6;\
 962         }\
 963         break;\
 964     case PIX_FMT_BGR24:\
 965         func(uint8_t,0)\
 966             ((uint8_t*)dest)[0]= b[Y1];\
 967             ((uint8_t*)dest)[1]= g[Y1];\
 968             ((uint8_t*)dest)[2]= r[Y1];\
 969             ((uint8_t*)dest)[3]= b[Y2];\
 970             ((uint8_t*)dest)[4]= g[Y2];\
 971             ((uint8_t*)dest)[5]= r[Y2];\
 972             dest+=6;\
 973         }\
 974         break;\
 975     case PIX_FMT_RGB565:\
 976     case PIX_FMT_BGR565:\
 977         {\
 978             const int dr1= dither_2x2_8[y&1    ][0];\
 979             const int dg1= dither_2x2_4[y&1    ][0];\
 980             const int db1= dither_2x2_8[(y&1)^1][0];\
 981             const int dr2= dither_2x2_8[y&1    ][1];\
 982             const int dg2= dither_2x2_4[y&1    ][1];\
 983             const int db2= dither_2x2_8[(y&1)^1][1];\
 984             func(uint16_t,0)\
 985                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
 986                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
 987             }\
 988         }\
 989         break;\
 990     case PIX_FMT_RGB555:\
 991     case PIX_FMT_BGR555:\
 992         {\
 993             const int dr1= dither_2x2_8[y&1    ][0];\
 994             const int dg1= dither_2x2_8[y&1    ][1];\
 995             const int db1= dither_2x2_8[(y&1)^1][0];\
 996             const int dr2= dither_2x2_8[y&1    ][1];\
 997             const int dg2= dither_2x2_8[y&1    ][0];\
 998             const int db2= dither_2x2_8[(y&1)^1][1];\
 999             func(uint16_t,0)\
1000                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
1001                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
1002             }\
1003         }\
1004         break;\
1005     case PIX_FMT_RGB444:\
1006     case PIX_FMT_BGR444:\
1007         {\
1008             const int dr1= dither_4x4_16[y&3    ][0];\
1009             const int dg1= dither_4x4_16[y&3    ][1];\
1010             const int db1= dither_4x4_16[(y&3)^3][0];\
1011             const int dr2= dither_4x4_16[y&3    ][1];\
1012             const int dg2= dither_4x4_16[y&3    ][0];\
1013             const int db2= dither_4x4_16[(y&3)^3][1];\
1014             func(uint16_t,0)\
1015                 ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];\
1016                 ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];\
1017             }\
1018         }\
1019         break;\
1020     case PIX_FMT_RGB8:\
1021     case PIX_FMT_BGR8:\
1022         {\
1023             const uint8_t * const d64= dither_8x8_73[y&7];\
1024             const uint8_t * const d32= dither_8x8_32[y&7];\
1025             func(uint8_t,0)\
1026                 ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];\
1027                 ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];\
1028             }\
1029         }\
1030         break;\
1031     case PIX_FMT_RGB4:\
1032     case PIX_FMT_BGR4:\
1033         {\
1034             const uint8_t * const d64= dither_8x8_73 [y&7];\
1035             const uint8_t * const d128=dither_8x8_220[y&7];\
1036             func(uint8_t,0)\
1037                 ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]\
1038                                  + ((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);\
1039             }\
1040         }\
1041         break;\
1042     case PIX_FMT_RGB4_BYTE:\
1043     case PIX_FMT_BGR4_BYTE:\
1044         {\
1045             const uint8_t * const d64= dither_8x8_73 [y&7];\
1046             const uint8_t * const d128=dither_8x8_220[y&7];\
1047             func(uint8_t,0)\
1048                 ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];\
1049                 ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];\
1050             }\
1051         }\
1052         break;\
1053     }
1054
/**
 * Vertically filter YUV (and optional alpha) lines and write one row of
 * packed RGB in the format given by c->dstFormat.
 *
 * The whole body is YSCALE_YUV_2_ANYRGB_C expanded with the
 * YSCALE_YUV_2_RGBX_C prologue, so every parameter is consumed by name
 * inside those macros: luma/chroma/alpha source line arrays with their
 * filter coefficients and sizes, the output pointer `dest`, the output
 * width `dstW` and the output row `y` (used to select dither rows).
 */
1055 static void yuv2packedX_c(SwsContext *c, const int16_t *lumFilter,
1056                           const int16_t **lumSrc, int lumFilterSize,
1057                           const int16_t *chrFilter, const int16_t **chrUSrc,
1058                           const int16_t **chrVSrc, int chrFilterSize,
1059                           const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
1060 {
1061     int i;
1062     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C)
1063 }
1064
/**
 * Vertically filter YUV (and optional alpha) lines and write one row of
 * 24/32-bit packed RGB using the context's matrix coefficients, with one
 * chroma sample per output pixel (see YSCALE_YUV_2_RGBX_FULL_C).
 *
 * Supported c->dstFormat values: ARGB, RGB24, RGBA, ABGR, BGR24, BGRA.
 * Any other value reaches assert(0).
 *
 * The case labels fall through on purpose so that the three layouts of each
 * colour order share one store loop:
 *  - xRGB/xBGR with alpha first: dest is advanced by one byte and aidx ends
 *    up as -1, so alpha is written to the byte in front of the colour bytes.
 *  - 24-bit: aidx ends up as 2; the alpha store to dest[2] is overwritten by
 *    the third colour store that follows it.
 *  - alpha last: aidx stays 3.
 * `step` (c->dstFormatBpp/8) is the number of bytes per output pixel.
 * R, G and B from the prologue are shifted right by 22 before being stored.
 */
1065 static void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
1066                             const int16_t **lumSrc, int lumFilterSize,
1067                             const int16_t *chrFilter, const int16_t **chrUSrc,
1068                             const int16_t **chrVSrc, int chrFilterSize,
1069                             const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
1070 {
1071     int i;
1072     int step= c->dstFormatBpp/8;
1073     int aidx= 3;
1074
1075     switch(c->dstFormat) {
1076     case PIX_FMT_ARGB:
1077         dest++;
1078         aidx= 0;
        /* fall through: aidx becomes -1 below */
1079     case PIX_FMT_RGB24:
1080         aidx--;
        /* fall through: shared R,G,B store loop */
1081     case PIX_FMT_RGBA:
1082         if (CONFIG_SMALL) {
1083             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
1084             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
1085                 dest[aidx]= needAlpha ? A : 255;
1086                 dest[0]= R>>22;
1087                 dest[1]= G>>22;
1088                 dest[2]= B>>22;
1089                 dest+= step;
1090             }
1091         } else {
1092             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1093                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
1094                     dest[aidx]= A;
1095                     dest[0]= R>>22;
1096                     dest[1]= G>>22;
1097                     dest[2]= B>>22;
1098                     dest+= step;
1099                 }
1100             } else {
1101                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
1102                     dest[aidx]= 255;
1103                     dest[0]= R>>22;
1104                     dest[1]= G>>22;
1105                     dest[2]= B>>22;
1106                     dest+= step;
1107                 }
1108             }
1109         }
1110         break;
1111     case PIX_FMT_ABGR:
1112         dest++;
1113         aidx= 0;
        /* fall through: aidx becomes -1 below */
1114     case PIX_FMT_BGR24:
1115         aidx--;
        /* fall through: shared B,G,R store loop */
1116     case PIX_FMT_BGRA:
1117         if (CONFIG_SMALL) {
1118             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
1119             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
1120                 dest[aidx]= needAlpha ? A : 255;
1121                 dest[0]= B>>22;
1122                 dest[1]= G>>22;
1123                 dest[2]= R>>22;
1124                 dest+= step;
1125             }
1126         } else {
1127             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1128                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
1129                     dest[aidx]= A;
1130                     dest[0]= B>>22;
1131                     dest[1]= G>>22;
1132                     dest[2]= R>>22;
1133                     dest+= step;
1134                 }
1135             } else {
1136                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
1137                     dest[aidx]= 255;
1138                     dest[0]= B>>22;
1139                     dest[1]= G>>22;
1140                     dest[2]= R>>22;
1141                     dest+= step;
1142                 }
1143             }
1144         }
1145         break;
1146     default:
1147         assert(0);
1148     }
1149 }
1150
1151 /**
1152  * vertical bilinear scale YV12 to RGB
 *
 * Blends two input lines per plane (buf0/buf1, ubuf0/ubuf1, vbuf0/vbuf1 and,
 * when alpha output is enabled, abuf0/abuf1) and writes one row of packed RGB
 * in the format given by c->dstFormat. yalpha and uvalpha are the weights of
 * the second luma/chroma line; the first line gets 4095 minus that weight.
 * The body is YSCALE_YUV_2_ANYRGB_C expanded with the YSCALE_YUV_2_RGB2_C
 * prologue, which uses all of these names directly; `y` selects dither rows.
1153  */
1154 static void yuv2packed2_c(SwsContext *c, const uint16_t *buf0,
1155                           const uint16_t *buf1, const uint16_t *ubuf0,
1156                           const uint16_t *ubuf1, const uint16_t *vbuf0,
1157                           const uint16_t *vbuf1, const uint16_t *abuf0,
1158                           const uint16_t *abuf1, uint8_t *dest, int dstW,
1159                           int yalpha, int uvalpha, int y)
1160 {
1161     int  yalpha1=4095- yalpha;
1162     int uvalpha1=4095-uvalpha;
1163     int i;
1164
1165     YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C)
1166 }
1167
1168 /**
1169  * YV12 to RGB without scaling or interpolating
 *
 * Luma (buf0) and alpha (abuf0) are taken from a single line. For chroma,
 * uvalpha < 2048 selects the YSCALE_YUV_2_RGB1_C prologue, which reads only
 * ubuf1/vbuf1; otherwise YSCALE_YUV_2_RGB1B_C averages ubuf0 with ubuf1 and
 * vbuf0 with vbuf1.
 *
 * The output format is taken from c->dstFormat inside
 * YSCALE_YUV_2_ANYRGB_C; the dstFormat and flags parameters are not
 * referenced by the macros used here.
1170  */
1171 static void yuv2packed1_c(SwsContext *c, const uint16_t *buf0,
1172                           const uint16_t *ubuf0, const uint16_t *ubuf1,
1173                           const uint16_t *vbuf0, const uint16_t *vbuf1,
1174                           const uint16_t *abuf0, uint8_t *dest, int dstW,
1175                           int uvalpha, enum PixelFormat dstFormat,
1176                           int flags, int y)
1177 {
1178     int i;
1179
1180     if (uvalpha < 2048) {
1181         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C)
1182     } else {
1183         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C)
1184     }
1185 }
1186
1187 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1188                                        int width, int height,
1189                                        int y, uint8_t val)
1190 {
1191     int i;
1192     uint8_t *ptr = plane + stride*y;
1193     for (i=0; i<height; i++) {
1194         memset(ptr, val, width);
1195         ptr += stride;
1196     }
1197 }
1198
/**
 * Convert 48-bit packed RGB/BGR pixels (6 bytes each) to 8-bit luma.
 *
 * Each 16-bit component is read big- or little-endian depending on
 * isBE(origin) and reduced to its upper 8 bits. The first and third
 * components are mapped to red/blue by the r and b macros, which swap them
 * for the BGR48 origins. The constant 33<<(RGB2YUV_SHIFT-1) equals 16.5 in
 * RGB2YUV_SHIFT fixed point.
 *
 * The macros input_pixel, r and b defined in this function stay defined for
 * the two rgb48ToUV templates that follow and are removed at the end of
 * rgb48ToUV_half_c_template; r and b expand to the locals named a and c.
 */
1199 static av_always_inline void
1200 rgb48ToY_c_template(uint8_t *dst, const uint8_t *src, int width,
1201                     enum PixelFormat origin)
1202 {
1203     int i;
1204     for (i = 0; i < width; i++) {
1205 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1206         int a = input_pixel(&src[i*6+0]) >> 8;
1207         int g = input_pixel(&src[i*6+2]) >> 8;
1208         int c = input_pixel(&src[i*6+4]) >> 8;
1209
1210 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? c : a)
1211 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? a : c)
1212         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1213     }
1214 }
1215
/**
 * Convert 48-bit packed RGB/BGR pixels to 8-bit U and V, one chroma pair per
 * input pixel.
 *
 * Only src1 is read; src2 must be the same pointer (asserted). Relies on the
 * input_pixel, r and b macros left defined by rgb48ToY_c_template, so the
 * locals must keep the names a, g and c. The constant
 * 257<<(RGB2YUV_SHIFT-1) equals 128.5 in RGB2YUV_SHIFT fixed point.
 */
1216 static av_always_inline void
1217 rgb48ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1218                     const uint8_t *src1, const uint8_t *src2,
1219                     int width, enum PixelFormat origin)
1220 {
1221     int i;
1222     assert(src1==src2);
1223     for (i = 0; i < width; i++) {
1224         int a = input_pixel(&src1[6*i + 0]) >> 8;
1225         int g = input_pixel(&src1[6*i + 2]) >> 8;
1226         int c = input_pixel(&src1[6*i + 4]) >> 8;
1227
1228         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1229         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1230     }
1231 }
1232
/**
 * Convert 48-bit packed RGB/BGR pixels to 8-bit U and V at half horizontal
 * resolution: each output sample is computed from two adjacent input pixels
 * (12 input bytes per output sample).
 *
 * The upper 8 bits of each component of the two pixels are summed, so the
 * result is shifted by RGB2YUV_SHIFT+1 and the constant is doubled
 * accordingly (257<<RGB2YUV_SHIFT). Only src1 is read; src2 must be the same
 * pointer (asserted).
 *
 * This is the last user of the input_pixel, r and b macros introduced in
 * rgb48ToY_c_template; they are undefined at the end of this function.
 */
1233 static av_always_inline void
1234 rgb48ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1235                           const uint8_t *src1, const uint8_t *src2,
1236                           int width, enum PixelFormat origin)
1237 {
1238     int i;
1239     assert(src1==src2);
1240     for (i = 0; i < width; i++) {
1241         int a = (input_pixel(&src1[12*i + 0]) >> 8) + (input_pixel(&src1[12*i + 6]) >> 8);
1242         int g = (input_pixel(&src1[12*i + 2]) >> 8) + (input_pixel(&src1[12*i + 8]) >> 8);
1243         int c = (input_pixel(&src1[12*i + 4]) >> 8) + (input_pixel(&src1[12*i + 10]) >> 8);
1244
1245         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1246         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1247     }
1248 #undef r
1249 #undef b
1250 #undef input_pixel
1251 }
1252
/*
 * rgb48funcs(pattern, BE_LE, origin): define the three non-inline entry
 * points <pattern>48<BE_LE>ToY_c, ...ToUV_c and ...ToUV_half_c, each of which
 * forwards to the matching rgb48*_c_template with `origin` as a constant
 * pixel format. The trailing uint32_t *unused parameter is ignored.
 *
 * Instantiated below for rgb/bgr in little- and big-endian form.
 */
1253 #define rgb48funcs(pattern, BE_LE, origin) \
1254 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *dst, const uint8_t *src, \
1255                                     int width, uint32_t *unused) \
1256 { \
1257     rgb48ToY_c_template(dst, src, width, origin); \
1258 } \
1259  \
1260 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1261                                     const uint8_t *src1, const uint8_t *src2, \
1262                                     int width, uint32_t *unused) \
1263 { \
1264     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1265 } \
1266  \
1267 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1268                                     const uint8_t *src1, const uint8_t *src2, \
1269                                     int width, uint32_t *unused) \
1270 { \
1271     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1272 }
1273
1274 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1275 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1276 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1277 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1278
/**
 * Generic 16/32-bit packed RGB to 8-bit luma conversion.
 *
 * For each pixel: read the packed value with input_pixel(), shift it right by
 * shp, isolate each component with mask{r,g,b} and shift it right by
 * sh{r,g,b}. The coefficients RY/GY/BY are pre-scaled by rsh/gsh/bsh so that
 * components left at different bit positions end up on a common scale; the
 * weighted sum plus rnd (33 << (S-1), i.e. 16.5 at scale 2^S) is shifted
 * right by S.
 *
 * input_pixel(i) reads 4 bytes per pixel via AV_RN32A when origin is one of
 * RGBA/BGRA/ARGB/ABGR, otherwise 2 bytes per pixel, big- or little-endian
 * according to isBE(origin). The macro stays defined for the two
 * rgb16_32ToUV templates that follow and is undefined at the end of
 * rgb16_32ToUV_half_c_template.
 *
 * NOTE(review): the coefficients are scaled with `<<`; this is only well
 * defined if RY, GY and BY are non-negative -- confirm at their definition.
 */
1279 static av_always_inline void
1280 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1281                        int width, enum PixelFormat origin,
1282                        int shr,   int shg,   int shb, int shp,
1283                        int maskr, int maskg, int maskb,
1284                        int rsh,   int gsh,   int bsh, int S)
1285 {
1286     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1287               rnd = 33 << (S - 1);
1288     int i;
1289
1290     for (i = 0; i < width; i++) {
1291 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1292                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1293                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1294         int px = input_pixel(i) >> shp;
1295         int b = (px & maskb) >> shb;
1296         int g = (px & maskg) >> shg;
1297         int r = (px & maskr) >> shr;
1298
1299         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1300     }
1301 }
1302
1303 static av_always_inline void
1304 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1305                         const uint8_t *src, int width,
1306                         enum PixelFormat origin,
1307                         int shr,   int shg,   int shb, int shp,
1308                         int maskr, int maskg, int maskb,
1309                         int rsh,   int gsh,   int bsh, int S)
1310 {
1311     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1312               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1313               rnd = 257 << (S - 1);
1314     int i;
1315
1316     for (i = 0; i < width; i++) {
1317         int px = input_pixel(i) >> shp;
1318         int b = (px & maskb) >> shb;
1319         int g = (px & maskg) >> shg;
1320         int r = (px & maskr) >> shr;
1321
1322         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1323         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1324     }
1325 }
1326
1327 static av_always_inline void
1328 rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1329                              const uint8_t *src, int width,
1330                              enum PixelFormat origin,
1331                              int shr,   int shg,   int shb, int shp,
1332                              int maskr, int maskg, int maskb,
1333                              int rsh,   int gsh,   int bsh, int S)
1334 {
1335     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1336               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1337               rnd = 257 << S, maskgx = ~(maskr | maskb);
1338     int i;
1339
1340     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1341     for (i = 0; i < width; i++) {
1342         int px0 = input_pixel(2 * i + 0) >> shp;
1343         int px1 = input_pixel(2 * i + 1) >> shp;
1344         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1345         int rb = px0 + px1 - g;
1346
1347         b = (rb & maskb) >> shb;
1348         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1349             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1350             g >>= shg;
1351         } else {
1352             g = (g  & maskg) >> shg;
1353         }
1354         r = (rb & maskr) >> shr;
1355
1356         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
1357         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
1358     }
1359 #undef input_pixel
1360 }
1361
/*
 * rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, maskg, maskb,
 *                  rsh, gsh, bsh, S):
 * define <name>ToY_c, <name>ToUV_c and <name>ToUV_half_c as non-inline entry
 * points that forward to the rgb16_32*_c_template functions with the given
 * constants. The second source pointer (dummy) and the trailing
 * uint32_t *unused parameter are ignored.
 *
 * Argument meaning, as used by the templates:
 *   fmt            pixel format passed as `origin`
 *   shp            right shift applied to the whole packed pixel first
 *   mask{r,g,b}    bit mask isolating each component after that shift
 *   sh{r,g,b}      right shift applied to each masked component
 *   {r,g,b}sh      scale exponent applied to that component's coefficients
 *   S              final right shift of the weighted sum
 */
1362 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1363                          maskg, maskb, rsh, gsh, bsh, S) \
1364 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1365                           int width, uint32_t *unused) \
1366 { \
1367     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1368                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1369 } \
1370  \
1371 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1372                            const uint8_t *src, const uint8_t *dummy, \
1373                            int width, uint32_t *unused) \
1374 { \
1375     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1376                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1377 } \
1378  \
1379 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1380                                 const uint8_t *src, const uint8_t *dummy, \
1381                                 int width, uint32_t *unused) \
1382 { \
1383     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1384                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1385 }
1386
/* 32-bit formats: the *_1 variants differ only in shp = 8. */
1387 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1388 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1389 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1390 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
/* 16-bit formats: components stay in place (all shifts 0) and the
 * coefficients are scaled instead; LE and BE rows differ only in fmt. */
1391 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1392 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1393 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1394 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1395 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1396 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1397 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1398 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1399
/**
 * Extract the alpha plane from 4-byte pixels whose alpha is the first byte.
 *
 * Copies byte 0 of each of `width` pixels from src to dst; `unused` is
 * ignored.
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *pixel = src;
    int x = 0;

    while (x < width) {
        dst[x++] = pixel[0];
        pixel += 4;
    }
}
1407
/**
 * Extract the alpha plane from 4-byte pixels whose alpha is the last byte.
 *
 * Copies byte 3 of each of `width` pixels from src to dst; `unused` is
 * ignored.
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const uint8_t *pixel = src;
    int x = 0;

    while (x < width) {
        dst[x++] = pixel[3];
        pixel += 4;
    }
}
1415
/**
 * Map palette indices to luma.
 *
 * For each of `width` index bytes in src, stores the lowest 8 bits of the
 * corresponding 32-bit palette entry in dst.
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1425
/**
 * Map palette indices to chroma.
 *
 * For each of `width` index bytes in src1, stores bits 8..15 of the
 * corresponding palette entry in dstU and bits 16..23 in dstV.
 * src2 must be the same pointer as src1 (asserted) and is not read.
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x;

    assert(src1 == src2);
    for (x = 0; x < width; x++) {
        const int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
    }
}
1439
/**
 * Expand 1-bit-per-pixel input, where a 0 bit is white, to 8-bit luma.
 *
 * Bits are consumed most-significant first; a cleared bit produces 255 and a
 * set bit produces 0. Exactly `width` output bytes are written.
 *
 * When width is not a multiple of 8, the remaining width % 8 pixels are taken
 * from the leading bits of the next source byte; previously those trailing
 * pixels were left unconverted. `unused` is ignored.
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int tail = width % 8; /* <= 0 for non-positive width: nothing extra */
    int i, j;

    for (i = 0; i < width/8; i++) {
        int d = ~src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }
    if (tail > 0) {
        /* partial last byte: only its first `tail` bits are pixels */
        int d = ~src[i];
        for (j = 0; j < tail; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }
}
1450
/**
 * Expand 1-bit-per-pixel input, where a 0 bit is black, to 8-bit luma.
 *
 * Bits are consumed most-significant first; a set bit produces 255 and a
 * cleared bit produces 0. Exactly `width` output bytes are written.
 *
 * When width is not a multiple of 8, the remaining width % 8 pixels are taken
 * from the leading bits of the next source byte; previously those trailing
 * pixels were left unconverted. `unused` is ignored.
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    const int tail = width % 8; /* <= 0 for non-positive width: nothing extra */
    int i, j;

    for (i = 0; i < width/8; i++) {
        int d = src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }
    if (tail > 0) {
        /* partial last byte: only its first `tail` bits are pixels */
        int d = src[i];
        for (j = 0; j < tail; j++)
            dst[8*i+j] = ((d>>(7-j))&1)*255;
    }
}
1461
1462 //FIXME yuy2* can read up to 7 samples too much
1463
/**
 * Extract luma from packed input whose luma samples sit at even byte
 * offsets: dst[i] = src[2*i] for `width` samples. `unused` is ignored.
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    const uint8_t *pair = src;
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = pair[0];
        pair += 2;
    }
}
1471
/**
 * Extract chroma from packed 4-byte groups with U at byte 1 and V at byte 3:
 * dstU[i] = src1[4*i + 1], dstV[i] = src1[4*i + 3] for `width` groups.
 *
 * src2 is not read; it is asserted to equal src1 after the copy.
 * `unused` is ignored.
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int x;

    for (x = 0; x < width; x++) {
        dstU[x] = group[1];
        dstV[x] = group[3];
        group  += 4;
    }
    assert(src1 == src2);
}
1482
/**
 * Take the second byte of every 2-byte sample from two separate chroma
 * planes: dstU[i] = src1[2*i + 1], dstV[i] = src2[2*i + 1].
 *
 * NOTE(review): going by the name, this presumably keeps the high byte of
 * little-endian 16-bit samples -- confirm at the call site.
 * `unused` is ignored.
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int hi = 2*x + 1;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
        x++;
    }
}
1492
/* This is almost identical to yuy2ToY_c and exists only because calling
 * yuy2ToY/UV(dst, src+1, ...) instead would make every access unaligned.
 *
 * Extracts luma from packed input whose luma samples sit at odd byte
 * offsets: dst[i] = src[2*i + 1] for `width` samples. `unused` is ignored. */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    const uint8_t *pair = src;
    int x;

    for (x = 0; x < width; x++) {
        dst[x] = pair[1];
        pair += 2;
    }
}
1502
/**
 * Extract chroma from packed 4-byte groups with U at byte 0 and V at byte 2:
 * dstU[i] = src1[4*i + 0], dstV[i] = src1[4*i + 2] for `width` groups.
 *
 * src2 is not read; it is asserted to equal src1 after the copy.
 * `unused` is ignored.
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    const uint8_t *group = src1;
    int x;

    for (x = 0; x < width; x++) {
        dstU[x] = group[0];
        dstV[x] = group[2];
        group  += 4;
    }
    assert(src1 == src2);
}
1513
/**
 * Take the first byte of every 2-byte sample from two separate chroma
 * planes: dstU[i] = src1[2*i], dstV[i] = src2[2*i].
 *
 * NOTE(review): going by the name, this presumably keeps the high byte of
 * big-endian 16-bit samples -- confirm at the call site.
 * `unused` is ignored.
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int hi = 2*x;

        dstU[x] = src1[hi];
        dstV[x] = src2[hi];
        x++;
    }
}
1523
1524 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1525                                         const uint8_t *src, int width)
1526 {
1527     int i;
1528     for (i = 0; i < width; i++) {
1529         dst1[i] = src[2*i+0];
1530         dst2[i] = src[2*i+1];
1531     }
1532 }
1533
/**
 * De-interleave a chroma plane of byte pairs stored U first, V second:
 * forwards to nvXXtoUV_c with dstU receiving the first byte of each pair.
 * src2 and unused are ignored.
 */
1534 static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
1535                        const uint8_t *src1, const uint8_t *src2,
1536                        int width, uint32_t *unused)
1537 {
1538     nvXXtoUV_c(dstU, dstV, src1, width);
1539 }
1540
/**
 * De-interleave a chroma plane of byte pairs stored V first, U second:
 * forwards to nvXXtoUV_c with the destinations swapped, so dstV receives the
 * first byte of each pair. src2 and unused are ignored.
 */
1541 static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
1542                        const uint8_t *src1, const uint8_t *src2,
1543                        int width, uint32_t *unused)
1544 {
1545     nvXXtoUV_c(dstV, dstU, src1, width);
1546 }
1547
1548 // FIXME Maybe dither instead.
/**
 * Reduce two planes of 16-bit-stored chroma samples of `depth` bits to
 * 8 bits by dropping the (depth - 8) least significant bits.
 *
 * Samples are read big- or little-endian according to isBE(origin). The
 * input_pixel macro defined here stays defined for yuv9_or_10ToY_c_template,
 * which follows and undefines it.
 */
1549 static av_always_inline void
1550 yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1551                           const uint8_t *_srcU, const uint8_t *_srcV,
1552                           int width, enum PixelFormat origin, int depth)
1553 {
1554     int i;
1555     const uint16_t *srcU = (const uint16_t *) _srcU;
1556     const uint16_t *srcV = (const uint16_t *) _srcV;
1557
1558 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1559     for (i = 0; i < width; i++) {
1560         dstU[i] = input_pixel(&srcU[i]) >> (depth - 8);
1561         dstV[i] = input_pixel(&srcV[i]) >> (depth - 8);
1562     }
1563 }
1564
/**
 * Reduce a plane of 16-bit-stored luma samples of `depth` bits to 8 bits by
 * dropping the (depth - 8) least significant bits.
 *
 * Uses the input_pixel macro defined in yuv9_OR_10ToUV_c_template (endianness
 * chosen by isBE(origin)) and undefines it at the end.
 */
1565 static av_always_inline void
1566 yuv9_or_10ToY_c_template(uint8_t *dstY, const uint8_t *_srcY,
1567                          int width, enum PixelFormat origin, int depth)
1568 {
1569     int i;
1570     const uint16_t *srcY = (const uint16_t*)_srcY;
1571
1572     for (i = 0; i < width; i++)
1573         dstY[i] = input_pixel(&srcY[i]) >> (depth - 8);
1574 #undef input_pixel
1575 }
1576
/*
 * YUV_NBPS(depth, BE_LE, origin): define <BE_LE><depth>ToUV_c and
 * <BE_LE><depth>ToY_c as non-inline entry points that forward to the
 * yuv9/10 templates with `origin` and `depth` as constants. The trailing
 * uint32_t *unused parameter is ignored.
 *
 * Instantiated below for 9- and 10-bit input in both byte orders.
 */
1577 #define YUV_NBPS(depth, BE_LE, origin) \
1578 static void BE_LE ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1579                                      const uint8_t *srcU, const uint8_t *srcV, \
1580                                      int width, uint32_t *unused) \
1581 { \
1582     yuv9_OR_10ToUV_c_template(dstU, dstV, srcU, srcV, width, origin, depth); \
1583 } \
1584 static void BE_LE ## depth ## ToY_c(uint8_t *dstY, const uint8_t *srcY, \
1585                                     int width, uint32_t *unused) \
1586 { \
1587     yuv9_or_10ToY_c_template(dstY, srcY, width, origin, depth); \
1588 }
1589
1590 YUV_NBPS( 9, LE, PIX_FMT_YUV420P9LE);
1591 YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
1592 YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
1593 YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
1594
1595 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1596                        int width, uint32_t *unused)
1597 {
1598     int i;
1599     for (i=0; i<width; i++) {
1600         int b= src[i*3+0];
1601         int g= src[i*3+1];
1602         int r= src[i*3+2];
1603
1604         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1605     }
1606 }
1607
1608 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1609                         const uint8_t *src2, int width, uint32_t *unused)
1610 {
1611     int i;
1612     for (i=0; i<width; i++) {
1613         int b= src1[3*i + 0];
1614         int g= src1[3*i + 1];
1615         int r= src1[3*i + 2];
1616
1617         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1618         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1619     }
1620     assert(src1 == src2);
1621 }
1622
1623 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1624                              const uint8_t *src2, int width, uint32_t *unused)
1625 {
1626     int i;
1627     for (i=0; i<width; i++) {
1628         int b= src1[6*i + 0] + src1[6*i + 3];
1629         int g= src1[6*i + 1] + src1[6*i + 4];
1630         int r= src1[6*i + 2] + src1[6*i + 5];
1631
1632         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1633         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1634     }
1635     assert(src1 == src2);
1636 }
1637
1638 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1639                        uint32_t *unused)
1640 {
1641     int i;
1642     for (i=0; i<width; i++) {
1643         int r= src[i*3+0];
1644         int g= src[i*3+1];
1645         int b= src[i*3+2];
1646
1647         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1648     }
1649 }
1650
1651 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1652                         const uint8_t *src2, int width, uint32_t *unused)
1653 {
1654     int i;
1655     assert(src1==src2);
1656     for (i=0; i<width; i++) {
1657         int r= src1[3*i + 0];
1658         int g= src1[3*i + 1];
1659         int b= src1[3*i + 2];
1660
1661         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1662         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1663     }
1664 }
1665
1666 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1667                              const uint8_t *src2, int width, uint32_t *unused)
1668 {
1669     int i;
1670     assert(src1==src2);
1671     for (i=0; i<width; i++) {
1672         int r= src1[6*i + 0] + src1[6*i + 3];
1673         int g= src1[6*i + 1] + src1[6*i + 4];
1674         int b= src1[6*i + 2] + src1[6*i + 5];
1675
1676         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1677         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1678     }
1679 }
1680
1681
// bilinear / bicubic scaling
/**
 * Horizontally filter one line of 8-bit samples.
 *
 * For every output sample i, filterSize consecutive input samples starting
 * at src[filterPos[i]] are multiplied with the coefficients
 * filter[filterSize * i ...] and summed; the sum is shifted right by 7 and
 * limited to at most 32767 before being stored.
 *
 * @param dst         output line (dstW entries)
 * @param dstW        number of output samples
 * @param src         input line
 * @param srcW        not referenced
 * @param xInc        not referenced
 * @param filter      dstW * filterSize coefficients
 * @param filterPos   first input index used for each output sample
 * @param filterSize  number of taps per output sample
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     int srcW, int xInc,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    const int limit = (1 << 15) - 1;
    int out;

    for (out = 0; out < dstW; out++) {
        const int16_t *coeffs = &filter[filterSize * out];
        const int first = filterPos[out];
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)src[first + tap]) * coeffs[tap];

        acc >>= 7;
        /* only the upper bound is enforced: the cubic equation does overflow ... */
        if (acc > limit)
            acc = limit;
        dst[out] = acc;
    }
}
1701
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * Rescale a line of U and a line of V samples in place.
 *
 * Each sample is first limited to at most 30775 and then mapped through
 * (x * 4663 - 9289992) >> 12.
 *
 * @param dstU   U samples, modified in place
 * @param dstV   V samples, modified in place
 * @param width  number of samples per line
 */
static void chrRangeToJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int u = dstU[x];
        int v = dstV[x];

        if (u > 30775)
            u = 30775;
        if (v > 30775)
            v = 30775;

        dstU[x] = (u * 4663 - 9289992) >> 12; //-264
        dstV[x] = (v * 4663 - 9289992) >> 12; //-264
    }
}
/**
 * Rescale a line of U and a line of V samples in place through
 * (x * 1799 + 4081085) >> 11.
 *
 * @param dstU   U samples, modified in place
 * @param dstV   V samples, modified in place
 * @param width  number of samples per line
 */
static void chrRangeFromJpeg_c(uint16_t *dstU, uint16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        const int u = dstU[x];
        const int v = dstV[x];

        dstU[x] = (u * 1799 + 4081085) >> 11; //1469
        dstV[x] = (v * 1799 + 4081085) >> 11; //1469
        x++;
    }
}
/**
 * Rescale a line of luma samples in place.
 *
 * Each sample is first limited to at most 30189 and then mapped through
 * (x * 19077 - 39057361) >> 14.
 *
 * @param dst    samples, modified in place
 * @param width  number of samples
 */
static void lumRangeToJpeg_c(uint16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        int y = dst[x];

        if (y > 30189)
            y = 30189;
        dst[x] = (y * 19077 - 39057361) >> 14;
    }
}
/**
 * Rescale a line of luma samples in place through
 * (x * 14071 + 33561947) >> 14.
 *
 * @param dst    samples, modified in place
 * @param width  number of samples
 */
static void lumRangeFromJpeg_c(uint16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int y = dst[x];

        dst[x] = (y * 14071 + 33561947) >> 14;
        x++;
    }
}
1732
1733 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1734                            const uint8_t *src, int srcW, int xInc)
1735 {
1736     int i;
1737     unsigned int xpos=0;
1738     for (i=0;i<dstWidth;i++) {
1739         register unsigned int xx=xpos>>16;
1740         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1741         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1742         xpos+=xInc;
1743     }
1744 }
1745
1746 // *** horizontal scale Y line to temp buffer
1747 static av_always_inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1748                                      const uint8_t *src, int srcW, int xInc,
1749                                      const int16_t *hLumFilter,
1750                                      const int16_t *hLumFilterPos, int hLumFilterSize,
1751                                      uint8_t *formatConvBuffer,
1752                                      uint32_t *pal, int isAlpha)
1753 {
1754     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1755     void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1756
1757     if (toYV12) {
1758         toYV12(formatConvBuffer, src, srcW, pal);
1759         src= formatConvBuffer;
1760     }
1761
1762     if (!c->hyscale_fast) {
1763         c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1764     } else { // fast bilinear upscale / crap downscale
1765         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1766     }
1767
1768     if (convertRange)
1769         convertRange(dst, dstWidth);
1770 }
1771
1772 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1773                            int dstWidth, const uint8_t *src1,
1774                            const uint8_t *src2, int srcW, int xInc)
1775 {
1776     int i;
1777     unsigned int xpos=0;
1778     for (i=0;i<dstWidth;i++) {
1779         register unsigned int xx=xpos>>16;
1780         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1781         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1782         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1783         xpos+=xInc;
1784     }
1785 }
1786
1787 static av_always_inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1788                                      const uint8_t *src1, const uint8_t *src2,
1789                                      int srcW, int xInc, const int16_t *hChrFilter,
1790                                      const int16_t *hChrFilterPos, int hChrFilterSize,
1791                                      uint8_t *formatConvBuffer, uint32_t *pal)
1792 {
1793     if (c->chrToYV12) {
1794         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1795         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1796         src1= formatConvBuffer;
1797         src2= buf2;
1798     }
1799
1800     if (!c->hcscale_fast) {
1801         c->hScale(dst1, dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1802         c->hScale(dst2, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1803     } else { // fast bilinear upscale / crap downscale
1804         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1805     }
1806
1807     if (c->chrConvertRange)
1808         c->chrConvertRange(dst1, dst2, dstWidth);
1809 }
1810
1811 static av_always_inline void
1812 find_c_packed_planar_out_funcs(SwsContext *c,
1813                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
1814                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
1815                                yuv2packedX_fn *yuv2packedX)
1816 {
1817     enum PixelFormat dstFormat = c->dstFormat;
1818
1819     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
1820         *yuv2yuvX     = yuv2nv12X_c;
1821     } else if (is16BPS(dstFormat)) {
1822         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
1823     } else if (is9_OR_10BPS(dstFormat)) {
1824         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
1825             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
1826         } else {
1827             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
1828         }
1829     } else {
1830         *yuv2yuv1     = yuv2yuv1_c;
1831         *yuv2yuvX     = yuv2yuvX_c;
1832     }
1833     if(c->flags & SWS_FULL_CHR_H_INT) {
1834         *yuv2packedX = yuv2rgbX_c_full;
1835     } else {
1836         switch (dstFormat) {
1837         case PIX_FMT_GRAY16BE:
1838             *yuv2packed1 = yuv2gray16BE_1_c;
1839             *yuv2packed2 = yuv2gray16BE_2_c;
1840             *yuv2packedX = yuv2gray16BE_X_c;
1841             break;
1842         case PIX_FMT_GRAY16LE:
1843             *yuv2packed1 = yuv2gray16LE_1_c;
1844             *yuv2packed2 = yuv2gray16LE_2_c;
1845             *yuv2packedX = yuv2gray16LE_X_c;
1846             break;
1847         case PIX_FMT_MONOWHITE:
1848             *yuv2packed1 = yuv2monowhite_1_c;
1849             *yuv2packed2 = yuv2monowhite_2_c;
1850             *yuv2packedX = yuv2monowhite_X_c;
1851             break;
1852         case PIX_FMT_MONOBLACK:
1853             *yuv2packed1 = yuv2monoblack_1_c;
1854             *yuv2packed2 = yuv2monoblack_2_c;
1855             *yuv2packedX = yuv2monoblack_X_c;
1856             break;
1857         case PIX_FMT_YUYV422:
1858             *yuv2packed1 = yuv2422yuyv_1_c;
1859             *yuv2packed2 = yuv2422yuyv_2_c;
1860             *yuv2packedX = yuv2422yuyv_X_c;
1861             break;
1862         case PIX_FMT_UYVY422:
1863             *yuv2packed1 = yuv2422uyvy_1_c;
1864             *yuv2packed2 = yuv2422uyvy_2_c;
1865             *yuv2packedX = yuv2422uyvy_X_c;
1866             break;
1867         default:
1868             *yuv2packed1 = yuv2packed1_c;
1869             *yuv2packed2 = yuv2packed2_c;
1870             *yuv2packedX = yuv2packedX_c;
1871             break;
1872         }
1873     }
1874 }
1875
/* Set to 1 to log the ring-buffer bookkeeping of swScale(). */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Conditional av_log() at debug level; expects a variable named c to be in
 * scope at the point of use. Wrapped in do { } while (0) so that the macro
 * behaves as a single statement and cannot capture a following else.
 */
#define DEBUG_BUFFERS(...) \
    do { \
        if (DEBUG_SWSCALE_BUFFERS) \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__); \
    } while (0)
1878
1879 static int swScale(SwsContext *c, const uint8_t* src[],
1880                    int srcStride[], int srcSliceY,
1881                    int srcSliceH, uint8_t* dst[], int dstStride[])
1882 {
1883     /* load a few things into local vars to make the code more readable? and faster */
1884     const int srcW= c->srcW;
1885     const int dstW= c->dstW;
1886     const int dstH= c->dstH;
1887     const int chrDstW= c->chrDstW;
1888     const int chrSrcW= c->chrSrcW;
1889     const int lumXInc= c->lumXInc;
1890     const int chrXInc= c->chrXInc;
1891     const enum PixelFormat dstFormat= c->dstFormat;
1892     const int flags= c->flags;
1893     int16_t *vLumFilterPos= c->vLumFilterPos;
1894     int16_t *vChrFilterPos= c->vChrFilterPos;
1895     int16_t *hLumFilterPos= c->hLumFilterPos;
1896     int16_t *hChrFilterPos= c->hChrFilterPos;
1897     int16_t *vLumFilter= c->vLumFilter;
1898     int16_t *vChrFilter= c->vChrFilter;
1899     int16_t *hLumFilter= c->hLumFilter;
1900     int16_t *hChrFilter= c->hChrFilter;
1901     int32_t *lumMmxFilter= c->lumMmxFilter;
1902     int32_t *chrMmxFilter= c->chrMmxFilter;
1903     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
1904     const int vLumFilterSize= c->vLumFilterSize;
1905     const int vChrFilterSize= c->vChrFilterSize;
1906     const int hLumFilterSize= c->hLumFilterSize;
1907     const int hChrFilterSize= c->hChrFilterSize;
1908     int16_t **lumPixBuf= c->lumPixBuf;
1909     int16_t **chrUPixBuf= c->chrUPixBuf;
1910     int16_t **chrVPixBuf= c->chrVPixBuf;
1911     int16_t **alpPixBuf= c->alpPixBuf;
1912     const int vLumBufSize= c->vLumBufSize;
1913     const int vChrBufSize= c->vChrBufSize;
1914     uint8_t *formatConvBuffer= c->formatConvBuffer;
1915     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
1916     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
1917     int lastDstY;
1918     uint32_t *pal=c->pal_yuv;
1919     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
1920     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
1921     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
1922     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
1923     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
1924
1925     /* vars which will change and which we need to store back in the context */
1926     int dstY= c->dstY;
1927     int lumBufIndex= c->lumBufIndex;
1928     int chrBufIndex= c->chrBufIndex;
1929     int lastInLumBuf= c->lastInLumBuf;
1930     int lastInChrBuf= c->lastInChrBuf;
1931
1932     if (isPacked(c->srcFormat)) {
1933         src[0]=
1934         src[1]=
1935         src[2]=
1936         src[3]= src[0];
1937         srcStride[0]=
1938         srcStride[1]=
1939         srcStride[2]=
1940         srcStride[3]= srcStride[0];
1941     }
1942     srcStride[1]<<= c->vChrDrop;
1943     srcStride[2]<<= c->vChrDrop;
1944
1945     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
1946                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
1947                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
1948     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
1949                    srcSliceY,    srcSliceH,    dstY,    dstH);
1950     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
1951                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
1952
1953     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
1954         static int warnedAlready=0; //FIXME move this into the context perhaps
1955         if (flags & SWS_PRINT_INFO && !warnedAlready) {
1956             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
1957                    "         ->cannot do aligned memory accesses anymore\n");
1958             warnedAlready=1;
1959         }
1960     }
1961
1962     /* Note the user might start scaling the picture in the middle so this
1963        will not get executed. This is not really intended but works
1964        currently, so people might do it. */
1965     if (srcSliceY ==0) {
1966         lumBufIndex=-1;
1967         chrBufIndex=-1;
1968         dstY=0;
1969         lastInLumBuf= -1;
1970         lastInChrBuf= -1;
1971     }
1972
1973     lastDstY= dstY;
1974
1975     for (;dstY < dstH; dstY++) {
1976         unsigned char *dest =dst[0]+dstStride[0]*dstY;
1977         const int chrDstY= dstY>>c->chrDstVSubSample;
1978         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
1979         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
1980         unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
1981
1982         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
1983         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
1984         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
1985         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
1986         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
1987         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
1988         int enough_lines;
1989
1990         //handle holes (FAST_BILINEAR & weird filters)
1991         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
1992         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
1993         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
1994         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
1995
1996         DEBUG_BUFFERS("dstY: %d\n", dstY);
1997         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
1998                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
1999         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2000                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2001
2002         // Do we have enough lines in this slice to output the dstY line
2003         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2004
2005         if (!enough_lines) {
2006             lastLumSrcY = srcSliceY + srcSliceH - 1;
2007             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2008             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2009                                             lastLumSrcY, lastChrSrcY);
2010         }
2011
2012         //Do horizontal scaling
2013         while(lastInLumBuf < lastLumSrcY) {
2014             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2015             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2016             lumBufIndex++;
2017             assert(lumBufIndex < 2*vLumBufSize);
2018             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2019             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2020             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2021                     hLumFilter, hLumFilterPos, hLumFilterSize,
2022                     formatConvBuffer,
2023                     pal, 0);
2024             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2025                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2026                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2027                         formatConvBuffer,
2028                         pal, 1);
2029             lastInLumBuf++;
2030             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2031                                lumBufIndex,    lastInLumBuf);
2032         }
2033         while(lastInChrBuf < lastChrSrcY) {
2034             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2035             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2036             chrBufIndex++;
2037             assert(chrBufIndex < 2*vChrBufSize);
2038             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2039             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2040             //FIXME replace parameters through context struct (some at least)
2041
2042             if (c->needs_hcscale)
2043                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2044                           chrDstW, src1, src2, chrSrcW, chrXInc,
2045                           hChrFilter, hChrFilterPos, hChrFilterSize,
2046                           formatConvBuffer, pal);
2047             lastInChrBuf++;
2048             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2049                                chrBufIndex,    lastInChrBuf);
2050         }
2051         //wrap buf index around to stay inside the ring buffer
2052         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2053         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2054         if (!enough_lines)
2055             break; //we can't output a dstY line so let's try with the next slice
2056
2057 #if HAVE_MMX
2058         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2059 #endif
2060         if (dstY >= dstH-2) {
2061             // hmm looks like we can't use MMX here without overwriting this array's tail
2062             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2063                                            &yuv2packed1, &yuv2packed2,
2064                                            &yuv2packedX);
2065         }
2066
2067         {
2068             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2069             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2070             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2071             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2072             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2073                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2074                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2075                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2076                     const int16_t *lumBuf = lumSrcPtr[0];
2077                     const int16_t *chrUBuf= chrUSrcPtr[0];
2078                     const int16_t *chrVBuf= chrVSrcPtr[0];
2079                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2080                     yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
2081                                 uDest, vDest, aDest, dstW, chrDstW);
2082                 } else { //General YV12
2083                     yuv2yuvX(c,
2084                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2085                                 vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
2086                                 chrVSrcPtr, vChrFilterSize,
2087                                 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2088                 }
2089             } else {
2090                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2091                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2092                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2093                     int chrAlpha= vChrFilter[2*dstY+1];
2094                     yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
2095                                    *chrVSrcPtr, *(chrVSrcPtr+1),
2096                                    alpPixBuf ? *alpSrcPtr : NULL,
2097                                    dest, dstW, chrAlpha, dstFormat, flags, dstY);
2098                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2099                     int lumAlpha= vLumFilter[2*dstY+1];
2100                     int chrAlpha= vChrFilter[2*dstY+1];
2101                     lumMmxFilter[2]=
2102                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
2103                     chrMmxFilter[2]=
2104                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2105                     yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
2106                                    *chrVSrcPtr, *(chrVSrcPtr+1),
2107                                    alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2108                                    dest, dstW, lumAlpha, chrAlpha, dstY);
2109                 } else { //general RGB
2110                     yuv2packedX(c,
2111                                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2112                                    vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2113                                    alpSrcPtr, dest, dstW, dstY);
2114                 }
2115             }
2116         }
2117     }
2118
2119     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2120         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2121
2122 #if HAVE_MMX2
2123     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2124         __asm__ volatile("sfence":::"memory");
2125 #endif
2126     emms_c();
2127
2128     /* store changed local vars back in the context */
2129     c->dstY= dstY;
2130     c->lumBufIndex= lumBufIndex;
2131     c->chrBufIndex= chrBufIndex;
2132     c->lastInLumBuf= lastInLumBuf;
2133     c->lastInChrBuf= lastInChrBuf;
2134
2135     return dstY - lastDstY;
2136 }
2137
2138 static av_cold void sws_init_swScale_c(SwsContext *c)
2139 {
2140     enum PixelFormat srcFormat = c->srcFormat;
2141
2142     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2143                                    &c->yuv2packed1, &c->yuv2packed2,
2144                                    &c->yuv2packedX);
2145
2146     c->hScale       = hScale_c;
2147
2148     if (c->flags & SWS_FAST_BILINEAR) {
2149         c->hyscale_fast = hyscale_fast_c;
2150         c->hcscale_fast = hcscale_fast_c;
2151     }
2152
2153     c->chrToYV12 = NULL;
2154     switch(srcFormat) {
2155         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2156         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2157         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2158         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2159         case PIX_FMT_RGB8     :
2160         case PIX_FMT_BGR8     :
2161         case PIX_FMT_PAL8     :
2162         case PIX_FMT_BGR4_BYTE:
2163         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2164         case PIX_FMT_YUV444P9BE:
2165         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
2166         case PIX_FMT_YUV444P9LE:
2167         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
2168         case PIX_FMT_YUV444P10BE:
2169         case PIX_FMT_YUV422P10BE:
2170         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
2171         case PIX_FMT_YUV422P10LE:
2172         case PIX_FMT_YUV444P10LE:
2173         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
2174         case PIX_FMT_YUV420P16BE:
2175         case PIX_FMT_YUV422P16BE:
2176         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
2177         case PIX_FMT_YUV420P16LE:
2178         case PIX_FMT_YUV422P16LE:
2179         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
2180     }
2181     if (c->chrSrcHSubSample) {
2182         switch(srcFormat) {
2183         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2184         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2185         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2186         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2187         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2188         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2189         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2190         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2191         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2192         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2193         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2194         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2195         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2196         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2197         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2198         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2199         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2200         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2201         }
2202     } else {
2203         switch(srcFormat) {
2204         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2205         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2206         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2207         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2208         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2209         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2210         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2211         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2212         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2213         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2214         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2215         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2216         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2217         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2218         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2219         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2220         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2221         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2222         }
2223     }
2224
2225     c->lumToYV12 = NULL;
2226     c->alpToYV12 = NULL;
2227     switch (srcFormat) {
2228     case PIX_FMT_YUV444P9BE:
2229     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
2230     case PIX_FMT_YUV444P9LE:
2231     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
2232     case PIX_FMT_YUV444P10BE:
2233     case PIX_FMT_YUV422P10BE:
2234     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2235     case PIX_FMT_YUV444P10LE:
2236     case PIX_FMT_YUV422P10LE:
2237     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2238     case PIX_FMT_YUYV422  :
2239     case PIX_FMT_YUV420P16BE:
2240     case PIX_FMT_YUV422P16BE:
2241     case PIX_FMT_YUV444P16BE:
2242     case PIX_FMT_Y400A    :
2243     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
2244     case PIX_FMT_UYVY422  :
2245     case PIX_FMT_YUV420P16LE:
2246     case PIX_FMT_YUV422P16LE:
2247     case PIX_FMT_YUV444P16LE:
2248     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c;    break;
2249     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2250     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2251     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2252     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2253     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2254     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2255     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2256     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2257     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2258     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2259     case PIX_FMT_RGB8     :
2260     case PIX_FMT_BGR8     :
2261     case PIX_FMT_PAL8     :
2262     case PIX_FMT_BGR4_BYTE:
2263     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2264     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2265     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2266     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2267     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2268     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2269     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2270     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2271     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2272     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2273     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2274     }
2275     if (c->alpPixBuf) {
2276         switch (srcFormat) {
2277         case PIX_FMT_BGRA:
2278         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2279         case PIX_FMT_ABGR:
2280         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2281         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2282         }
2283     }
2284
2285     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2286         if (c->srcRange) {
2287             c->lumConvertRange = lumRangeFromJpeg_c;
2288             c->chrConvertRange = chrRangeFromJpeg_c;
2289         } else {
2290             c->lumConvertRange = lumRangeToJpeg_c;
2291             c->chrConvertRange = chrRangeToJpeg_c;
2292         }
2293     }
2294
2295     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2296           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2297         c->needs_hcscale = 1;
2298 }
2299
2300 SwsFunc ff_getSwsFunc(SwsContext *c)
2301 {
2302     sws_init_swScale_c(c);
2303
2304     if (HAVE_MMX)
2305         ff_sws_init_swScale_mmx(c);
2306     if (HAVE_ALTIVEC)
2307         ff_sws_init_swScale_altivec(c);
2308
2309     return swScale;
2310 }