git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
     /* NOTE(review): DITHER1XBPP is not referenced in this part of the file;
      * presumably it is a feature switch tested further down -- confirm. */
  71
     /*
      * Fixed-point RGB -> YUV weights.
      * Each macro takes a floating-point weight, multiplies it by 219/255
      * (the *Y macros) or 224/255 (the *U and *V macros), scales it by
      * 1 << RGB2YUV_SHIFT and adds 0.5 before the cast to int, i.e. the value
      * is rounded to the nearest integer.  Negative weights are produced by
      * negating the already-rounded positive magnitude.
      * Naming, as far as the values show: first letter = source channel
      * (R, G or B), second letter = produced component (Y, U or V).
      */
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 static const double rgb2yuv_table[8][9]={
     /*
      * Floating-point RGB -> YUV weights, one row per colour-space variant
      * (named in the trailing comment of each row).
      * Column order, judging by the ITU601 rows matching the GY/BY/RY,
      * GU/BU/RU and GV/BV/RV macro weights above:
      *   { GY, BY, RY,  GU, BU, RU,  GV, BV, RV }
      * NOTE(review): the row index is presumably a colour-space id used by
      * callers elsewhere in the file -- confirm against the user of the table.
      */
  84     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  85     {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5}, //ITU709
  86     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  87     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  88     {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
  89     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  90     {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //DEFAULT / ITU601 / ITU624 / SMPTE 170M
  91     {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
  92 };
  93
  94 /*
  95 NOTES
  96 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  97
  98 TODO
  99 more intelligent misalignment avoidance for the horizontal scaler
 100 write special vertical cubic upscale version
 101 optimize C code (YV12 / minmax)
 102 add support for packed pixel YUV input & output
 103 add support for Y8 output
 104 optimize BGR24 & BGR32
 105 add BGR4 output support
 106 write special BGR->BGR scaler
 107 */
 108
 109 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
     /* Dither thresholds 0..3: two rows, each a 2-entry pattern repeated
      * across 8 columns.  File-local (static); not referenced in the visible
      * part of the file. */
 110 {  1,   3,   1,   3,   1,   3,   1,   3, },
 111 {  2,   0,   2,   0,   2,   0,   2,   0, },
 112 };
 113
 114 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
     /* Dither thresholds 0, 2, 4, 6: two rows, each a 2-entry pattern
      * repeated across 8 columns.  File-local (static); not referenced in the
      * visible part of the file. */
 115 {  6,   2,   6,   2,   6,   2,   6,   2, },
 116 {  0,   4,   0,   4,   0,   4,   0,   4, },
 117 };
 118
 119 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
     /* Dither thresholds 0..15: four rows, each a 4-entry pattern repeated
      * twice to fill 8 columns.  Not static, so the symbol has external
      * linkage. */
 120 {  8,   4,  11,   7,   8,   4,  11,   7, },
 121 {  2,  14,   1,  13,   2,  14,   1,  13, },
 122 { 10,   6,   9,   5,  10,   6,   9,   5, },
 123 {  0,  12,   3,  15,   0,  12,   3,  15, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
     /* 8x8 dither thresholds in the range 0..31.  Not static, so the symbol
      * has external linkage. */
 127 { 17,   9,  23,  15,  16,   8,  22,  14, },
 128 {  5,  29,   3,  27,   4,  28,   2,  26, },
 129 { 21,  13,  19,  11,  20,  12,  18,  10, },
 130 {  0,  24,   6,  30,   1,  25,   7,  31, },
 131 { 16,   8,  22,  14,  17,   9,  23,  15, },
 132 {  4,  28,   2,  26,   5,  29,   3,  27, },
 133 { 20,  12,  18,  10,  21,  13,  19,  11, },
 134 {  1,  25,   7,  31,   0,  24,   6,  30, },
 135 };
 136
 137 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
     /* 8x8 dither thresholds in the range 0..72.  Not static, so the symbol
      * has external linkage. */
 138 {  0,  55,  14,  68,   3,  58,  17,  72, },
 139 { 37,  18,  50,  32,  40,  22,  54,  35, },
 140 {  9,  64,   5,  59,  13,  67,   8,  63, },
 141 { 46,  27,  41,  23,  49,  31,  44,  26, },
 142 {  2,  57,  16,  71,   1,  56,  15,  70, },
 143 { 39,  21,  52,  34,  38,  19,  51,  33, },
 144 { 11,  66,   7,  62,  10,  65,   6,  60, },
 145 { 48,  30,  43,  25,  47,  29,  42,  24, },
 146 };
 147
 148 #if 1
     /*
      * dither_8x8_220: 8x8 dither thresholds in the range 0..217, indexed by
      * (row & 7, column & 7) in the yuv2mono_* templates below.
      * Only this first variant is compiled because the leading "#if 1" is
      * true; the "#elif 1" / "#else" alternatives (gamma 1.5 / 2.0 / 2.5
      * corrected tables) are kept for reference and are never selected
      * unless the conditions are edited by hand.
      */
 149 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 150 {117,  62, 158, 103, 113,  58, 155, 100, },
 151 { 34, 199,  21, 186,  31, 196,  17, 182, },
 152 {144,  89, 131,  76, 141,  86, 127,  72, },
 153 {  0, 165,  41, 206,  10, 175,  52, 217, },
 154 {110,  55, 151,  96, 120,  65, 162, 107, },
 155 { 28, 193,  14, 179,  38, 203,  24, 189, },
 156 {138,  83, 124,  69, 148,  93, 134,  79, },
 157 {  7, 172,  48, 213,   3, 168,  45, 210, },
 158 };
 159 #elif 1
 160 // tries to correct a gamma of 1.5
 161 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 162 {  0, 143,  18, 200,   2, 156,  25, 215, },
 163 { 78,  28, 125,  64,  89,  36, 138,  74, },
 164 { 10, 180,   3, 161,  16, 195,   8, 175, },
 165 {109,  51,  93,  38, 121,  60, 105,  47, },
 166 {  1, 152,  23, 210,   0, 147,  20, 205, },
 167 { 85,  33, 134,  71,  81,  30, 130,  67, },
 168 { 14, 190,   6, 171,  12, 185,   5, 166, },
 169 {117,  57, 101,  44, 113,  54,  97,  41, },
 170 };
 171 #elif 1
 172 // tries to correct a gamma of 2.0
 173 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 174 {  0, 124,   8, 193,   0, 140,  12, 213, },
 175 { 55,  14, 104,  42,  66,  19, 119,  52, },
 176 {  3, 168,   1, 145,   6, 187,   3, 162, },
 177 { 86,  31,  70,  21,  99,  39,  82,  28, },
 178 {  0, 134,  11, 206,   0, 129,   9, 200, },
 179 { 62,  17, 114,  48,  58,  16, 109,  45, },
 180 {  5, 181,   2, 157,   4, 175,   1, 151, },
 181 { 95,  36,  78,  26,  90,  34,  74,  24, },
 182 };
 183 #else
 184 // tries to correct a gamma of 2.5
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 186 {  0, 107,   3, 187,   0, 125,   6, 212, },
 187 { 39,   7,  86,  28,  49,  11, 102,  36, },
 188 {  1, 158,   0, 131,   3, 180,   1, 151, },
 189 { 68,  19,  52,  12,  81,  25,  64,  17, },
 190 {  0, 119,   5, 203,   0, 113,   4, 195, },
 191 { 45,   9,  96,  33,  42,   8,  91,  30, },
 192 {  2, 172,   1, 144,   2, 165,   0, 137, },
 193 { 77,  23,  60,  15,  72,  21,  56,  14, },
 194 };
 195 #endif
 196
 197 static av_always_inline void
 198 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 199                       int lumFilterSize, const int16_t *chrFilter,
 200                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 201                       int chrFilterSize, const int16_t **alpSrc,
 202                       uint16_t *dest, uint16_t *uDest, uint16_t *vDest,
 203                       uint16_t *aDest, int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     int shift = 11 + 16 - output_bits;
 209
 210 #define output_pixel(pos, val) \
 211     if (big_endian) { \
 212         if (output_bits == 16) { \
 213             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 214         } else { \
 215             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 216         } \
 217     } else { \
 218         if (output_bits == 16) { \
 219             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 220         } else { \
 221             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 222         } \
 223     }
 224     for (i = 0; i < dstW; i++) {
 225         int val = 1 << (26-output_bits);
 226         int j;
 227
 228         for (j = 0; j < lumFilterSize; j++)
 229             val += lumSrc[j][i] * lumFilter[j];
 230
 231         output_pixel(&dest[i], val);
 232     }
 233
 234     if (uDest) {
 235         for (i = 0; i < chrDstW; i++) {
 236             int u = 1 << (26-output_bits);
 237             int v = 1 << (26-output_bits);
 238             int j;
 239
 240             for (j = 0; j < chrFilterSize; j++) {
 241                 u += chrUSrc[j][i] * chrFilter[j];
 242                 v += chrVSrc[j][i] * chrFilter[j];
 243             }
 244
 245             output_pixel(&uDest[i], u);
 246             output_pixel(&vDest[i], v);
 247         }
 248     }
 249
 250     if (CONFIG_SWSCALE_ALPHA && aDest) {
 251         for (i = 0; i < dstW; i++) {
 252             int val = 1 << (26-output_bits);
 253             int j;
 254
 255             for (j = 0; j < lumFilterSize; j++)
 256                 val += alpSrc[j][i] * lumFilter[j];
 257
 258             output_pixel(&aDest[i], val);
 259         }
 260     }
 261 #undef output_pixel
 262 }
 263
 264 #define yuv2NBPS(bits, BE_LE, is_be) \
 265 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 266                               const int16_t **lumSrc, int lumFilterSize, \
 267                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 268                               const int16_t **chrVSrc, \
 269                               int chrFilterSize, const int16_t **alpSrc, \
 270                               uint8_t *_dest, uint8_t *_uDest, uint8_t *_vDest, \
 271                               uint8_t *_aDest, int dstW, int chrDstW) \
 272 { \
 273     uint16_t *dest  = (uint16_t *) _dest,  *uDest = (uint16_t *) _uDest, \
 274              *vDest = (uint16_t *) _vDest, *aDest = (uint16_t *) _aDest; \
 275     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 276                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 277                           alpSrc, \
 278                           dest, uDest, vDest, aDest, \
 279                           dstW, chrDstW, is_be, bits); \
 280 }
 281 yuv2NBPS( 9, BE, 1);
 282 yuv2NBPS( 9, LE, 0);
 283 yuv2NBPS(10, BE, 1);
 284 yuv2NBPS(10, LE, 0);
 285 yuv2NBPS(16, BE, 1);
 286 yuv2NBPS(16, LE, 0);
 287
 288 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 289                        const int16_t **lumSrc, int lumFilterSize,
 290                        const int16_t *chrFilter, const int16_t **chrUSrc,
 291                        const int16_t **chrVSrc,
 292                        int chrFilterSize, const int16_t **alpSrc,
 293                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 294                        uint8_t *aDest, int dstW, int chrDstW)
 295 {
 296     //FIXME Optimize (just quickly written not optimized..)
 297     int i;
 298     for (i=0; i<dstW; i++) {
 299         int val=1<<18;
 300         int j;
 301         for (j=0; j<lumFilterSize; j++)
 302             val += lumSrc[j][i] * lumFilter[j];
 303
 304         dest[i]= av_clip_uint8(val>>19);
 305     }
 306
 307     if (uDest)
 308         for (i=0; i<chrDstW; i++) {
 309             int u=1<<18;
 310             int v=1<<18;
 311             int j;
 312             for (j=0; j<chrFilterSize; j++) {
 313                 u += chrUSrc[j][i] * chrFilter[j];
 314                 v += chrVSrc[j][i] * chrFilter[j];
 315             }
 316
 317             uDest[i]= av_clip_uint8(u>>19);
 318             vDest[i]= av_clip_uint8(v>>19);
 319         }
 320
 321     if (CONFIG_SWSCALE_ALPHA && aDest)
 322         for (i=0; i<dstW; i++) {
 323             int val=1<<18;
 324             int j;
 325             for (j=0; j<lumFilterSize; j++)
 326                 val += alpSrc[j][i] * lumFilter[j];
 327
 328             aDest[i]= av_clip_uint8(val>>19);
 329         }
 330 }
 331
 332 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 333                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 334                        const int16_t *alpSrc,
 335                        uint8_t *dest, uint8_t *uDest, uint8_t *vDest,
 336                        uint8_t *aDest, int dstW, int chrDstW)
 337 {
 338     int i;
 339     for (i=0; i<dstW; i++) {
 340         int val= (lumSrc[i]+64)>>7;
 341         dest[i]= av_clip_uint8(val);
 342     }
 343
 344     if (uDest)
 345         for (i=0; i<chrDstW; i++) {
 346             int u=(chrUSrc[i]+64)>>7;
 347             int v=(chrVSrc[i]+64)>>7;
 348             uDest[i]= av_clip_uint8(u);
 349             vDest[i]= av_clip_uint8(v);
 350         }
 351
 352     if (CONFIG_SWSCALE_ALPHA && aDest)
 353         for (i=0; i<dstW; i++) {
 354             int val= (alpSrc[i]+64)>>7;
 355             aDest[i]= av_clip_uint8(val);
 356         }
 357 }
 358
 359 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 360                         const int16_t **lumSrc, int lumFilterSize,
 361                         const int16_t *chrFilter, const int16_t **chrUSrc,
 362                         const int16_t **chrVSrc, int chrFilterSize,
 363                         const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest,
 364                         uint8_t *vDest, uint8_t *aDest,
 365                         int dstW, int chrDstW)
 366 {
 367     enum PixelFormat dstFormat = c->dstFormat;
 368
 369     //FIXME Optimize (just quickly written not optimized..)
 370     int i;
 371     for (i=0; i<dstW; i++) {
 372         int val=1<<18;
 373         int j;
 374         for (j=0; j<lumFilterSize; j++)
 375             val += lumSrc[j][i] * lumFilter[j];
 376
 377         dest[i]= av_clip_uint8(val>>19);
 378     }
 379
 380     if (!uDest)
 381         return;
 382
 383     if (dstFormat == PIX_FMT_NV12)
 384         for (i=0; i<chrDstW; i++) {
 385             int u=1<<18;
 386             int v=1<<18;
 387             int j;
 388             for (j=0; j<chrFilterSize; j++) {
 389                 u += chrUSrc[j][i] * chrFilter[j];
 390                 v += chrVSrc[j][i] * chrFilter[j];
 391             }
 392
 393             uDest[2*i]= av_clip_uint8(u>>19);
 394             uDest[2*i+1]= av_clip_uint8(v>>19);
 395         }
 396     else
 397         for (i=0; i<chrDstW; i++) {
 398             int u=1<<18;
 399             int v=1<<18;
 400             int j;
 401             for (j=0; j<chrFilterSize; j++) {
 402                 u += chrUSrc[j][i] * chrFilter[j];
 403                 v += chrVSrc[j][i] * chrFilter[j];
 404             }
 405
 406             uDest[2*i]= av_clip_uint8(v>>19);
 407             uDest[2*i+1]= av_clip_uint8(u>>19);
 408         }
 409 }
 410
 /*
  * Store one 16-bit gray sample at pos: big-endian when the enclosing
  * function's `target` is PIX_FMT_GRAY16BE, little-endian otherwise.
  * Wrapped in do { } while (0) so the macro behaves as a single statement
  * (safe in an unbraced if/else) and requires the usual trailing ';'.
  */
 #define output_pixel(pos, val) \
     do { \
         if (target == PIX_FMT_GRAY16BE) { \
             AV_WB16(pos, val); \
         } else { \
             AV_WL16(pos, val); \
         } \
     } while (0)
 417
 418 static av_always_inline void
 419 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 420                         const int16_t **lumSrc, int lumFilterSize,
 421                         const int16_t *chrFilter, const int16_t **chrUSrc,
 422                         const int16_t **chrVSrc, int chrFilterSize,
 423                         const int16_t **alpSrc, uint8_t *dest, int dstW,
 424                         int y, enum PixelFormat target)
 425 {
 426     int i;
 427
 428     for (i = 0; i < (dstW >> 1); i++) {
 429         int j;
 430         int Y1 = 1 << 18;
 431         int Y2 = 1 << 18;
 432         const int i2 = 2 * i;
 433
 434         for (j = 0; j < lumFilterSize; j++) {
 435             Y1 += lumSrc[j][i2]   * lumFilter[j];
 436             Y2 += lumSrc[j][i2+1] * lumFilter[j];
 437         }
 438         Y1 >>= 11;
 439         Y2 >>= 11;
 440         if ((Y1 | Y2) & 0x10000) {
 441             Y1 = av_clip_uint16(Y1);
 442             Y2 = av_clip_uint16(Y2);
 443         }
 444         output_pixel(&dest[2 * i2 + 0], Y1);
 445         output_pixel(&dest[2 * i2 + 2], Y2);
 446     }
 447 }
 448
 449 static av_always_inline void
 450 yuv2gray16_2_c_template(SwsContext *c, const uint16_t *buf0,
 451                         const uint16_t *buf1, const uint16_t *ubuf0,
 452                         const uint16_t *ubuf1, const uint16_t *vbuf0,
 453                         const uint16_t *vbuf1, const uint16_t *abuf0,
 454                         const uint16_t *abuf1, uint8_t *dest, int dstW,
 455                         int yalpha, int uvalpha, int y,
 456                         enum PixelFormat target)
 457 {
 458     int  yalpha1 = 4095 - yalpha; \
 459     int i;
 460
 461     for (i = 0; i < (dstW >> 1); i++) {
 462         const int i2 = 2 * i;
 463         int Y1 = (buf0[i2  ] * yalpha1 + buf1[i2  ] * yalpha) >> 11;
 464         int Y2 = (buf0[i2+1] * yalpha1 + buf1[i2+1] * yalpha) >> 11;
 465
 466         output_pixel(&dest[2 * i2 + 0], Y1);
 467         output_pixel(&dest[2 * i2 + 2], Y2);
 468     }
 469 }
 470
 471 static av_always_inline void
 472 yuv2gray16_1_c_template(SwsContext *c, const uint16_t *buf0,
 473                         const uint16_t *ubuf0, const uint16_t *ubuf1,
 474                         const uint16_t *vbuf0, const uint16_t *vbuf1,
 475                         const uint16_t *abuf0, uint8_t *dest, int dstW,
 476                         int uvalpha, enum PixelFormat dstFormat,
 477                         int flags, int y, enum PixelFormat target)
 478 {
 479     int i;
 480
 481     for (i = 0; i < (dstW >> 1); i++) {
 482         const int i2 = 2 * i;
 483         int Y1 = buf0[i2  ] << 1;
 484         int Y2 = buf0[i2+1] << 1;
 485
 486         output_pixel(&dest[2 * i2 + 0], Y1);
 487         output_pixel(&dest[2 * i2 + 2], Y2);
 488     }
 489 }
 490
 491 #undef output_pixel
 492
 493 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 494 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 495                         const int16_t **lumSrc, int lumFilterSize, \
 496                         const int16_t *chrFilter, const int16_t **chrUSrc, \
 497                         const int16_t **chrVSrc, int chrFilterSize, \
 498                         const int16_t **alpSrc, uint8_t *dest, int dstW, \
 499                         int y) \
 500 { \
 501     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 502                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 503                           alpSrc, dest, dstW, y, fmt); \
 504 } \
 505  \
 506 static void name ## ext ## _2_c(SwsContext *c, const uint16_t *buf0, \
 507                         const uint16_t *buf1, const uint16_t *ubuf0, \
 508                         const uint16_t *ubuf1, const uint16_t *vbuf0, \
 509                         const uint16_t *vbuf1, const uint16_t *abuf0, \
 510                         const uint16_t *abuf1, uint8_t *dest, int dstW, \
 511                         int yalpha, int uvalpha, int y) \
 512 { \
 513     name ## base ## _2_c_template(c, buf0, buf1, ubuf0, ubuf1, \
 514                           vbuf0, vbuf1, abuf0, abuf1, \
 515                           dest, dstW, yalpha, uvalpha, y, fmt); \
 516 } \
 517  \
 518 static void name ## ext ## _1_c(SwsContext *c, const uint16_t *buf0, \
 519                         const uint16_t *ubuf0, const uint16_t *ubuf1, \
 520                         const uint16_t *vbuf0, const uint16_t *vbuf1, \
 521                         const uint16_t *abuf0, uint8_t *dest, int dstW, \
 522                         int uvalpha, enum PixelFormat dstFormat, \
 523                         int flags, int y) \
 524 { \
 525     name ## base ## _1_c_template(c, buf0, ubuf0, ubuf1, vbuf0, \
 526                           vbuf1, abuf0, dest, dstW, uvalpha, \
 527                           dstFormat, flags, y, fmt); \
 528 }
 529
 530 YUV2PACKEDWRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 531 YUV2PACKEDWRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 532
 /*
  * Store one byte of eight packed 1-bit pixels into pos: the accumulator as
  * is when the enclosing function's `target` is PIX_FMT_MONOBLACK, its
  * bitwise complement otherwise.
  * Wrapped in do { } while (0) so the macro behaves as a single statement,
  * and the accumulator argument is parenthesised so that any expression can
  * be passed.
  */
 #define output_pixel(pos, acc) \
     do { \
         if (target == PIX_FMT_MONOBLACK) { \
             pos = (acc); \
         } else { \
             pos = ~(acc); \
         } \
     } while (0)
 539
 540 static av_always_inline void
 541 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 542                       const int16_t **lumSrc, int lumFilterSize,
 543                       const int16_t *chrFilter, const int16_t **chrUSrc,
 544                       const int16_t **chrVSrc, int chrFilterSize,
 545                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 546                       int y, enum PixelFormat target)
 547 {
 548     const uint8_t * const d128=dither_8x8_220[y&7];
 549     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 550     int i;
 551     int acc = 0;
 552
 553     for (i = 0; i < dstW - 1; i += 2) {
 554         int j;
 555         int Y1 = 1 << 18;
 556         int Y2 = 1 << 18;
 557
 558         for (j = 0; j < lumFilterSize; j++) {
 559             Y1 += lumSrc[j][i]   * lumFilter[j];
 560             Y2 += lumSrc[j][i+1] * lumFilter[j];
 561         }
 562         Y1 >>= 19;
 563         Y2 >>= 19;
 564         if ((Y1 | Y2) & 0x100) {
 565             Y1 = av_clip_uint8(Y1);
 566             Y2 = av_clip_uint8(Y2);
 567         }
 568         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 569         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 570         if ((i & 7) == 6) {
 571             output_pixel(*dest++, acc);
 572         }
 573     }
 574 }
 575
 576 static av_always_inline void
 577 yuv2mono_2_c_template(SwsContext *c, const uint16_t *buf0,
 578                       const uint16_t *buf1, const uint16_t *ubuf0,
 579                       const uint16_t *ubuf1, const uint16_t *vbuf0,
 580                       const uint16_t *vbuf1, const uint16_t *abuf0,
 581                       const uint16_t *abuf1, uint8_t *dest, int dstW,
 582                       int yalpha, int uvalpha, int y,
 583                       enum PixelFormat target)
 584 {
 585     const uint8_t * const d128 = dither_8x8_220[y & 7];
 586     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 587     int  yalpha1 = 4095 - yalpha;
 588     int i;
 589
 590     for (i = 0; i < dstW - 7; i += 8) {
 591         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 592         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 593         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 594         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 595         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 596         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 597         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 598         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 599         output_pixel(*dest++, acc);
 600     }
 601 }
 602
 603 static av_always_inline void
 604 yuv2mono_1_c_template(SwsContext *c, const uint16_t *buf0,
 605                       const uint16_t *ubuf0, const uint16_t *ubuf1,
 606                       const uint16_t *vbuf0, const uint16_t *vbuf1,
 607                       const uint16_t *abuf0, uint8_t *dest, int dstW,
 608                       int uvalpha, enum PixelFormat dstFormat,
 609                       int flags, int y, enum PixelFormat target)
 610 {
 611     const uint8_t * const d128 = dither_8x8_220[y & 7];
 612     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 613     int i;
 614
 615     for (i = 0; i < dstW - 7; i += 8) {
 616         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 617         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 618         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 619         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 620         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 621         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 622         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 623         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 624         output_pixel(*dest++, acc);
 625     }
 626 }
 627
 628 #undef output_pixel
 629
 630 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 631 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 632
 /*
  * Store one pair of pixels (two luma samples sharing one U and one V) into
  * the enclosing function's `dest` at byte offset pos: Y1 U Y2 V when
  * `target` is PIX_FMT_YUYV422, U Y1 V Y2 otherwise.
  * Wrapped in do { } while (0) so the macro behaves as a single statement;
  * pos and the sample arguments are parenthesised so that any expression can
  * be passed.
  */
 #define output_pixels(pos, Y1, U, Y2, V) \
     do { \
         if (target == PIX_FMT_YUYV422) { \
             dest[(pos) + 0] = (Y1); \
             dest[(pos) + 1] = (U);  \
             dest[(pos) + 2] = (Y2); \
             dest[(pos) + 3] = (V);  \
         } else { \
             dest[(pos) + 0] = (U);  \
             dest[(pos) + 1] = (Y1); \
             dest[(pos) + 2] = (V);  \
             dest[(pos) + 3] = (Y2); \
         } \
     } while (0)
 645
 646 static av_always_inline void
 647 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 648                      const int16_t **lumSrc, int lumFilterSize,
 649                      const int16_t *chrFilter, const int16_t **chrUSrc,
 650                      const int16_t **chrVSrc, int chrFilterSize,
 651                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 652                      int y, enum PixelFormat target)
 653 {
 654     int i;
 655
 656     for (i = 0; i < (dstW >> 1); i++) {
 657         int j;
 658         int Y1 = 1 << 18;
 659         int Y2 = 1 << 18;
 660         int U  = 1 << 18;
 661         int V  = 1 << 18;
 662
 663         for (j = 0; j < lumFilterSize; j++) {
 664             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 665             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 666         }
 667         for (j = 0; j < chrFilterSize; j++) {
 668             U += chrUSrc[j][i] * chrFilter[j];
 669             V += chrVSrc[j][i] * chrFilter[j];
 670         }
 671         Y1 >>= 19;
 672         Y2 >>= 19;
 673         U  >>= 19;
 674         V  >>= 19;
 675         if ((Y1 | Y2 | U | V) & 0x100) {
 676             Y1 = av_clip_uint8(Y1);
 677             Y2 = av_clip_uint8(Y2);
 678             U  = av_clip_uint8(U);
 679             V  = av_clip_uint8(V);
 680         }
 681         output_pixels(4*i, Y1, U, Y2, V);
 682     }
 683 }
 684
 685 static av_always_inline void
 686 yuv2422_2_c_template(SwsContext *c, const uint16_t *buf0,
 687                      const uint16_t *buf1, const uint16_t *ubuf0,
 688                      const uint16_t *ubuf1, const uint16_t *vbuf0,
 689                      const uint16_t *vbuf1, const uint16_t *abuf0,
 690                      const uint16_t *abuf1, uint8_t *dest, int dstW,
 691                      int yalpha, int uvalpha, int y,
 692                      enum PixelFormat target)
 693 {
 694     int  yalpha1 = 4095 - yalpha;
 695     int uvalpha1 = 4095 - uvalpha;
 696     int i;
 697
 698     for (i = 0; i < (dstW >> 1); i++) {
 699         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 700         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 701         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 702         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 703
 704         output_pixels(i * 4, Y1, U, Y2, V);
 705     }
 706 }
 707
 708 static av_always_inline void
 709 yuv2422_1_c_template(SwsContext *c, const uint16_t *buf0,
 710                      const uint16_t *ubuf0, const uint16_t *ubuf1,
 711                      const uint16_t *vbuf0, const uint16_t *vbuf1,
 712                      const uint16_t *abuf0, uint8_t *dest, int dstW,
 713                      int uvalpha, enum PixelFormat dstFormat,
 714                      int flags, int y, enum PixelFormat target)
 715 {
 716     int i;
 717
 718     if (uvalpha < 2048) {
 719         for (i = 0; i < (dstW >> 1); i++) {
 720             int Y1 = buf0[i * 2]     >> 7;
 721             int Y2 = buf0[i * 2 + 1] >> 7;
 722             int U  = ubuf1[i]        >> 7;
 723             int V  = vbuf1[i]        >> 7;
 724
 725             output_pixels(i * 4, Y1, U, Y2, V);
 726         }
 727     } else {
 728         for (i = 0; i < (dstW >> 1); i++) {
 729             int Y1 =  buf0[i * 2]          >> 7;
 730             int Y2 =  buf0[i * 2 + 1]      >> 7;
 731             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 732             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 733
 734             output_pixels(i * 4, Y1, U, Y2, V);
 735         }
 736     }
 737 }
 738
 739 #undef output_pixels
 740
 741 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
 742 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 743
 744 #define r_b ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? r : b)
     /* r_b / b_r pick between the local lookup pointers `r` and `b` of the
      * enclosing function according to its `target`: for PIX_FMT_RGB48LE/BE
      * r_b is r and b_r is b; for every other target the two are swapped.
      * Both macros rely on `target`, `r` and `b` being in scope at the point
      * of use. */
 745 #define b_r ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? b : r)
 746
 747 static av_always_inline void
 748 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 749                        const int16_t **lumSrc, int lumFilterSize,
 750                        const int16_t *chrFilter, const int16_t **chrUSrc,
 751                        const int16_t **chrVSrc, int chrFilterSize,
 752                        const int16_t **alpSrc, uint8_t *dest, int dstW,
 753                        int y, enum PixelFormat target)
 754 {
 755     int i;
 756
 757     for (i = 0; i < (dstW >> 1); i++) {
 758         int j;
 759         int Y1 = 1 << 18;
 760         int Y2 = 1 << 18;
 761         int U  = 1 << 18;
 762         int V  = 1 << 18;
 763         const uint8_t *r, *g, *b;
 764
 765         for (j = 0; j < lumFilterSize; j++) {
 766             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 767             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 768         }
 769         for (j = 0; j < chrFilterSize; j++) {
 770             U += chrUSrc[j][i] * chrFilter[j];
 771             V += chrVSrc[j][i] * chrFilter[j];
 772         }
 773         Y1 >>= 19;
 774         Y2 >>= 19;
 775         U  >>= 19;
 776         V  >>= 19;
 777         if ((Y1 | Y2 | U | V) & 0x100) {
 778             Y1 = av_clip_uint8(Y1);
 779             Y2 = av_clip_uint8(Y2);
 780             U  = av_clip_uint8(U);
 781             V  = av_clip_uint8(V);
 782         }
 783
 784         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
 785         r = (const uint8_t *) c->table_rV[V];
 786         g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]);
 787         b = (const uint8_t *) c->table_bU[U];
 788
 789         dest[ 0] = dest[ 1] = r_b[Y1];
 790         dest[ 2] = dest[ 3] =   g[Y1];
 791         dest[ 4] = dest[ 5] = b_r[Y1];
 792         dest[ 6] = dest[ 7] = r_b[Y2];
 793         dest[ 8] = dest[ 9] =   g[Y2];
 794         dest[10] = dest[11] = b_r[Y2];
 795         dest += 12;
 796     }
 797 }
 798
 799 static av_always_inline void
 800 yuv2rgb48_2_c_template(SwsContext *c, const uint16_t *buf0,
 801                        const uint16_t *buf1, const uint16_t *ubuf0,
 802                        const uint16_t *ubuf1, const uint16_t *vbuf0,
 803                        const uint16_t *vbuf1, const uint16_t *abuf0,
 804                        const uint16_t *abuf1, uint8_t *dest, int dstW,
 805                        int yalpha, int uvalpha, int y,
 806                        enum PixelFormat target)
 807 {
 808     int  yalpha1 = 4095 - yalpha;
 809     int uvalpha1 = 4095 - uvalpha;
 810     int i;
 811
 812     for (i = 0; i < (dstW >> 1); i++) {
 813         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 814         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 815         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 816         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 817         const uint8_t *r = (const uint8_t *) c->table_rV[V],
 818                       *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 819                       *b = (const uint8_t *) c->table_bU[U];
 820
 821         dest[ 0] = dest[ 1] = r_b[Y1];
 822         dest[ 2] = dest[ 3] =   g[Y1];
 823         dest[ 4] = dest[ 5] = b_r[Y1];
 824         dest[ 6] = dest[ 7] = r_b[Y2];
 825         dest[ 8] = dest[ 9] =   g[Y2];
 826         dest[10] = dest[11] = b_r[Y2];
 827         dest += 12;
 828     }
 829 }
 830
 831 static av_always_inline void
 832 yuv2rgb48_1_c_template(SwsContext *c, const uint16_t *buf0,
 833                        const uint16_t *ubuf0, const uint16_t *ubuf1,
 834                        const uint16_t *vbuf0, const uint16_t *vbuf1,
 835                        const uint16_t *abuf0, uint8_t *dest, int dstW,
 836                        int uvalpha, enum PixelFormat dstFormat,
 837                        int flags, int y, enum PixelFormat target)
 838 {
 839     int i;
 840
 841     if (uvalpha < 2048) {
 842         for (i = 0; i < (dstW >> 1); i++) {
 843             int Y1 = buf0[i * 2]     >> 7;
 844             int Y2 = buf0[i * 2 + 1] >> 7;
 845             int U  = ubuf1[i]        >> 7;
 846             int V  = vbuf1[i]        >> 7;
 847             const uint8_t *r = (const uint8_t *) c->table_rV[V],
 848                           *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 849                           *b = (const uint8_t *) c->table_bU[U];
 850
 851             dest[ 0] = dest[ 1] = r_b[Y1];
 852             dest[ 2] = dest[ 3] =   g[Y1];
 853             dest[ 4] = dest[ 5] = b_r[Y1];
 854             dest[ 6] = dest[ 7] = r_b[Y2];
 855             dest[ 8] = dest[ 9] =   g[Y2];
 856             dest[10] = dest[11] = b_r[Y2];
 857             dest += 12;
 858         }
 859     } else {
 860         for (i = 0; i < (dstW >> 1); i++) {
 861             int Y1 =  buf0[i * 2]          >> 7;
 862             int Y2 =  buf0[i * 2 + 1]      >> 7;
 863             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 864             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 865             const uint8_t *r = (const uint8_t *) c->table_rV[V],
 866                           *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 867                           *b = (const uint8_t *) c->table_bU[U];
 868
 869             dest[ 0] = dest[ 1] = r_b[Y1];
 870             dest[ 2] = dest[ 3] =   g[Y1];
 871             dest[ 4] = dest[ 5] = b_r[Y1];
 872             dest[ 6] = dest[ 7] = r_b[Y2];
 873             dest[ 8] = dest[ 9] =   g[Y2];
 874             dest[10] = dest[11] = b_r[Y2];
 875             dest += 12;
 876         }
 877     }
 878 }
 879
 880 #undef r_b
 881 #undef b_r
 882
 883 YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
 884 //YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
 885 YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
 886 //YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
 887
 888 static av_always_inline void
 889 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
 890               int U, int V, int A1, int A2,
 891               const void *_r, const void *_g, const void *_b, int y,
 892               enum PixelFormat target, int hasAlpha)
 893 {
 894     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 895         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 896         uint32_t *dest = (uint32_t *) _dest;
 897         const uint32_t *r = (const uint32_t *) _r;
 898         const uint32_t *g = (const uint32_t *) _g;
 899         const uint32_t *b = (const uint32_t *) _b;
 900
 901 #if CONFIG_SMALL
 902         int sh = hasAlpha ? ((fmt == PIX_FMT_RGB32_1 || fmt == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 903
 904         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
 905         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
 906 #else
 907         if (hasAlpha) {
 908             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
 909
 910             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
 911             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
 912         } else {
 913             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
 914             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 915         }
 916 #endif
 917     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
 918         uint8_t *dest = (uint8_t *) _dest;
 919         const uint8_t *r = (const uint8_t *) _r;
 920         const uint8_t *g = (const uint8_t *) _g;
 921         const uint8_t *b = (const uint8_t *) _b;
 922
 923 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
 924 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
 925         dest[i * 6 + 0] = r_b[Y1];
 926         dest[i * 6 + 1] =   g[Y1];
 927         dest[i * 6 + 2] = b_r[Y1];
 928         dest[i * 6 + 3] = r_b[Y2];
 929         dest[i * 6 + 4] =   g[Y2];
 930         dest[i * 6 + 5] = b_r[Y2];
 931 #undef r_b
 932 #undef b_r
 933     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
 934                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
 935                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
 936         uint16_t *dest = (uint16_t *) _dest;
 937         const uint16_t *r = (const uint16_t *) _r;
 938         const uint16_t *g = (const uint16_t *) _g;
 939         const uint16_t *b = (const uint16_t *) _b;
 940         int dr1, dg1, db1, dr2, dg2, db2;
 941
 942         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
 943             dr1 = dither_2x2_8[ y & 1     ][0];
 944             dg1 = dither_2x2_4[ y & 1     ][0];
 945             db1 = dither_2x2_8[(y & 1) ^ 1][0];
 946             dr2 = dither_2x2_8[ y & 1     ][1];
 947             dg2 = dither_2x2_4[ y & 1     ][1];
 948             db2 = dither_2x2_8[(y & 1) ^ 1][1];
 949         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
 950             dr1 = dither_2x2_8[ y & 1     ][0];
 951             dg1 = dither_2x2_8[ y & 1     ][1];
 952             db1 = dither_2x2_8[(y & 1) ^ 1][0];
 953             dr2 = dither_2x2_8[ y & 1     ][1];
 954             dg2 = dither_2x2_8[ y & 1     ][0];
 955             db2 = dither_2x2_8[(y & 1) ^ 1][1];
 956         } else {
 957             dr1 = dither_4x4_16[ y & 3     ][0];
 958             dg1 = dither_4x4_16[ y & 3     ][1];
 959             db1 = dither_4x4_16[(y & 3) ^ 3][0];
 960             dr2 = dither_4x4_16[ y & 3     ][1];
 961             dg2 = dither_4x4_16[ y & 3     ][0];
 962             db2 = dither_4x4_16[(y & 3) ^ 3][1];
 963         }
 964
 965         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
 966         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
 967     } else /* 8/4-bit */ {
 968         uint8_t *dest = (uint8_t *) _dest;
 969         const uint8_t *r = (const uint8_t *) _r;
 970         const uint8_t *g = (const uint8_t *) _g;
 971         const uint8_t *b = (const uint8_t *) _b;
 972         int dr1, dg1, db1, dr2, dg2, db2;
 973
 974         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
 975             const uint8_t * const d64 = dither_8x8_73[y & 7];
 976             const uint8_t * const d32 = dither_8x8_32[y & 7];
 977             dr1 = dg1 = d32[(i * 2 + 0) & 7];
 978             db1 =       d64[(i * 2 + 0) & 7];
 979             dr2 = dg2 = d32[(i * 2 + 1) & 7];
 980             db2 =       d64[(i * 2 + 1) & 7];
 981         } else {
 982             const uint8_t * const d64  = dither_8x8_73 [y & 7];
 983             const uint8_t * const d128 = dither_8x8_220[y & 7];
 984             dr1 = db1 = d128[(i * 2 + 0) & 7];
 985             dg1 =        d64[(i * 2 + 0) & 7];
 986             dr2 = db2 = d128[(i * 2 + 1) & 7];
 987             dg2 =        d64[(i * 2 + 1) & 7];
 988         }
 989
 990         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
 991             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
 992                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
 993         } else {
 994             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
 995             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
 996         }
 997     }
 998 }
 999
1000 static av_always_inline void
1001 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1002                      const int16_t **lumSrc, int lumFilterSize,
1003                      const int16_t *chrFilter, const int16_t **chrUSrc,
1004                      const int16_t **chrVSrc, int chrFilterSize,
1005                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1006                      int y, enum PixelFormat target, int hasAlpha)
1007 {
1008     int i;
1009
1010     for (i = 0; i < (dstW >> 1); i++) {
1011         int j;
1012         int Y1 = 1 << 18;
1013         int Y2 = 1 << 18;
1014         int U  = 1 << 18;
1015         int V  = 1 << 18;
1016         int av_unused A1, A2;
1017         const void *r, *g, *b;
1018
1019         for (j = 0; j < lumFilterSize; j++) {
1020             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1021             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1022         }
1023         for (j = 0; j < chrFilterSize; j++) {
1024             U += chrUSrc[j][i] * chrFilter[j];
1025             V += chrVSrc[j][i] * chrFilter[j];
1026         }
1027         Y1 >>= 19;
1028         Y2 >>= 19;
1029         U  >>= 19;
1030         V  >>= 19;
1031         if ((Y1 | Y2 | U | V) & 0x100) {
1032             Y1 = av_clip_uint8(Y1);
1033             Y2 = av_clip_uint8(Y2);
1034             U  = av_clip_uint8(U);
1035             V  = av_clip_uint8(V);
1036         }
1037         if (hasAlpha) {\
1038             A1 = 1 << 18;
1039             A2 = 1 << 18;
1040             for (j = 0; j < lumFilterSize; j++) {
1041                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1042                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1043             }
1044             A1 >>= 19;
1045             A2 >>= 19;
1046             if ((A1 | A2) & 0x100) {
1047                 A1 = av_clip_uint8(A1);
1048                 A2 = av_clip_uint8(A2);
1049             }
1050         }
1051
1052         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1053         r =  c->table_rV[V];
1054         g = (c->table_gU[U] + c->table_gV[V]);
1055         b =  c->table_bU[U];
1056
1057         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1058                       r, g, b, y, target, hasAlpha);
1059     }
1060 }
1061
1062 static av_always_inline void
1063 yuv2rgb_2_c_template(SwsContext *c, const uint16_t *buf0,
1064                      const uint16_t *buf1, const uint16_t *ubuf0,
1065                      const uint16_t *ubuf1, const uint16_t *vbuf0,
1066                      const uint16_t *vbuf1, const uint16_t *abuf0,
1067                      const uint16_t *abuf1, uint8_t *dest, int dstW,
1068                      int yalpha, int uvalpha, int y,
1069                      enum PixelFormat target, int hasAlpha)
1070 {
1071     int  yalpha1 = 4095 - yalpha;
1072     int uvalpha1 = 4095 - uvalpha;
1073     int i;
1074
1075     for (i = 0; i < (dstW >> 1); i++) {
1076         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1077         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1078         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1079         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1080         int A1, A2;
1081         const void *r =  c->table_rV[V],
1082                    *g = (c->table_gU[U] + c->table_gV[V]),
1083                    *b =  c->table_bU[U];
1084
1085         if (hasAlpha) {
1086             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1087             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1088         }
1089
1090         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1091                       r, g, b, y, target, hasAlpha);
1092     }
1093 }
1094
1095 static av_always_inline void
1096 yuv2rgb_1_c_template(SwsContext *c, const uint16_t *buf0,
1097                      const uint16_t *ubuf0, const uint16_t *ubuf1,
1098                      const uint16_t *vbuf0, const uint16_t *vbuf1,
1099                      const uint16_t *abuf0, uint8_t *dest, int dstW,
1100                      int uvalpha, enum PixelFormat dstFormat,
1101                      int flags, int y, enum PixelFormat target,
1102                      int hasAlpha)
1103 {
1104     int i;
1105
1106     if (uvalpha < 2048) {
1107         for (i = 0; i < (dstW >> 1); i++) {
1108             int Y1 = buf0[i * 2]     >> 7;
1109             int Y2 = buf0[i * 2 + 1] >> 7;
1110             int U  = ubuf1[i]        >> 7;
1111             int V  = vbuf1[i]        >> 7;
1112             int A1, A2;
1113             const void *r =  c->table_rV[V],
1114                        *g = (c->table_gU[U] + c->table_gV[V]),
1115                        *b =  c->table_bU[U];
1116
1117             if (hasAlpha) {
1118                 A1 = abuf0[i * 2    ] >> 7;
1119                 A2 = abuf0[i * 2 + 1] >> 7;
1120             }
1121
1122             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1123                           r, g, b, y, target, hasAlpha);
1124         }
1125     } else {
1126         for (i = 0; i < (dstW >> 1); i++) {
1127             int Y1 =  buf0[i * 2]          >> 7;
1128             int Y2 =  buf0[i * 2 + 1]      >> 7;
1129             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1130             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1131             int A1, A2;
1132             const void *r =  c->table_rV[V],
1133                        *g = (c->table_gU[U] + c->table_gV[V]),
1134                        *b =  c->table_bU[U];
1135
1136             if (hasAlpha) {
1137                 A1 = abuf0[i * 2    ] >> 7;
1138                 A2 = abuf0[i * 2 + 1] >> 7;
1139             }
1140
1141             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1142                           r, g, b, y, target, hasAlpha);
1143         }
1144     }
1145 }
1146
/*
 * Generate the three concrete output functions
 *     <name><ext>_X_c, <name><ext>_2_c and <name><ext>_1_c
 * Each one forwards its arguments to the matching
 *     <name><base>_{X,2,1}_c_template
 * and appends `fmt` and `hasAlpha` as the trailing template arguments.
 * `hasAlpha` is pasted as an expression inside the generated functions, so
 * it may refer to their parameter `c`.
 */
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, \
                                const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, \
                                uint8_t *dest, int dstW, int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt, hasAlpha); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, \
                                const uint16_t *buf0,  const uint16_t *buf1, \
                                const uint16_t *ubuf0, const uint16_t *ubuf1, \
                                const uint16_t *vbuf0, const uint16_t *vbuf1, \
                                const uint16_t *abuf0, const uint16_t *abuf1, \
                                uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf0, buf1, ubuf0, ubuf1, \
                                  vbuf0, vbuf1, abuf0, abuf1, \
                                  dest, dstW, yalpha, uvalpha, y, \
                                  fmt, hasAlpha); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const uint16_t *buf0, \
                                const uint16_t *ubuf0, const uint16_t *ubuf1, \
                                const uint16_t *vbuf0, const uint16_t *vbuf1, \
                                const uint16_t *abuf0, \
                                uint8_t *dest, int dstW, \
                                int uvalpha, enum PixelFormat dstFormat, \
                                int flags, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf0, ubuf1, vbuf0, \
                                  vbuf1, abuf0, dest, dstW, uvalpha, \
                                  dstFormat, flags, y, fmt, hasAlpha); \
}
1183
1184 #if CONFIG_SMALL
1185 YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1186 YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
1187 #else
1188 #if CONFIG_SWSCALE_ALPHA
1189 YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
1190 YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
1191 #endif
1192 YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
1193 YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
1194 #endif
1195 YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
1196 YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
1197 YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
1198 YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
1199 YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
1200 YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
1201 YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
1202 YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1203
/*
 * Per-pixel YUV -> RGB computation used by yuv2rgbX_c_full().
 *
 * The macro OPENS a `for (i = 0; i < dstW; i++) {` loop and leaves it open:
 * the code following the macro invocation is the tail of the loop body and
 * must supply the closing brace. Inside that tail, R, G and B hold the
 * results clipped to 30 bits, and A holds the filtered alpha when `alpha`
 * is non-zero.
 *
 * The expansion expects these names in scope: i, dstW, c, lumSrc, lumFilter,
 * lumFilterSize, chrUSrc, chrVSrc, chrFilter, chrFilterSize and alpSrc.
 *
 * rnd   - rounding term added to the scaled luma and used as the start
 *         value of the alpha accumulator
 * alpha - non-zero to compute A from alpSrc with the luma filter
 *
 * The chroma bias is written as -(128 << 19): left-shifting a negative
 * value, as in -128<<19, is undefined behaviour in C.
 */
#define YSCALE_YUV_2_RGBX_FULL_C(rnd,alpha) \
    for (i=0; i<dstW; i++) {\
        int j;\
        int Y = 0;\
        int U = -(128 << 19);\
        int V = -(128 << 19);\
        int av_unused A;\
        int R,G,B;\
        \
        for (j=0; j<lumFilterSize; j++) {\
            Y += lumSrc[j][i     ] * lumFilter[j];\
        }\
        for (j=0; j<chrFilterSize; j++) {\
            U += chrUSrc[j][i] * chrFilter[j];\
            V += chrVSrc[j][i] * chrFilter[j];\
        }\
        Y >>=10;\
        U >>=10;\
        V >>=10;\
        if (alpha) {\
            A = rnd;\
            for (j=0; j<lumFilterSize; j++)\
                A += alpSrc[j][i     ] * lumFilter[j];\
            A >>=19;\
            if (A&0x100)\
                A = av_clip_uint8(A);\
        }\
        Y-= c->yuv2rgb_y_offset;\
        Y*= c->yuv2rgb_y_coeff;\
        Y+= rnd;\
        R= Y + V*c->yuv2rgb_v2r_coeff;\
        G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
        B= Y +                          U*c->yuv2rgb_u2b_coeff;\
        if ((R|G|B)&(0xC0000000)) {\
            R = av_clip_uintp2(R, 30); \
            G = av_clip_uintp2(G, 30); \
            B = av_clip_uintp2(B, 30); \
        }
1242
1243 static void yuv2rgbX_c_full(SwsContext *c, const int16_t *lumFilter,
1244                             const int16_t **lumSrc, int lumFilterSize,
1245                             const int16_t *chrFilter, const int16_t **chrUSrc,
1246                             const int16_t **chrVSrc, int chrFilterSize,
1247                             const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
1248 {
1249     int i;
1250     int step= c->dstFormatBpp/8;
1251     int aidx= 3;
1252
1253     switch(c->dstFormat) {
1254     case PIX_FMT_ARGB:
1255         dest++;
1256         aidx= 0;
1257     case PIX_FMT_RGB24:
1258         aidx--;
1259     case PIX_FMT_RGBA:
1260         if (CONFIG_SMALL) {
1261             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
1262             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
1263                 dest[aidx]= needAlpha ? A : 255;
1264                 dest[0]= R>>22;
1265                 dest[1]= G>>22;
1266                 dest[2]= B>>22;
1267                 dest+= step;
1268             }
1269         } else {
1270             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1271                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
1272                     dest[aidx]= A;
1273                     dest[0]= R>>22;
1274                     dest[1]= G>>22;
1275                     dest[2]= B>>22;
1276                     dest+= step;
1277                 }
1278             } else {
1279                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
1280                     dest[aidx]= 255;
1281                     dest[0]= R>>22;
1282                     dest[1]= G>>22;
1283                     dest[2]= B>>22;
1284                     dest+= step;
1285                 }
1286             }
1287         }
1288         break;
1289     case PIX_FMT_ABGR:
1290         dest++;
1291         aidx= 0;
1292     case PIX_FMT_BGR24:
1293         aidx--;
1294     case PIX_FMT_BGRA:
1295         if (CONFIG_SMALL) {
1296             int needAlpha = CONFIG_SWSCALE_ALPHA && c->alpPixBuf;
1297             YSCALE_YUV_2_RGBX_FULL_C(1<<21, needAlpha)
1298                 dest[aidx]= needAlpha ? A : 255;
1299                 dest[0]= B>>22;
1300                 dest[1]= G>>22;
1301                 dest[2]= R>>22;
1302                 dest+= step;
1303             }
1304         } else {
1305             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1306                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 1)
1307                     dest[aidx]= A;
1308                     dest[0]= B>>22;
1309                     dest[1]= G>>22;
1310                     dest[2]= R>>22;
1311                     dest+= step;
1312                 }
1313             } else {
1314                 YSCALE_YUV_2_RGBX_FULL_C(1<<21, 0)
1315                     dest[aidx]= 255;
1316                     dest[0]= B>>22;
1317                     dest[1]= G>>22;
1318                     dest[2]= R>>22;
1319                     dest+= step;
1320                 }
1321             }
1322         }
1323         break;
1324     default:
1325         assert(0);
1326     }
1327 }
1328
1329 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1330                                        int width, int height,
1331                                        int y, uint8_t val)
1332 {
1333     int i;
1334     uint8_t *ptr = plane + stride*y;
1335     for (i=0; i<height; i++) {
1336         memset(ptr, val, width);
1337         ptr += stride;
1338     }
1339 }
1340
/*
 * Helpers for the 48-bit RGB/BGR input readers below; they expand in a scope
 * that provides `origin`, `r_b` and `b_r`.
 *
 * input_pixel(pos): read one 16-bit component at pos in the byte order of
 *                   `origin`.
 * r / b           : map onto the first-read (r_b) or last-read (b_r)
 *                   component depending on whether `origin` is a BGR48
 *                   format.
 */
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))

#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1345
1346 static av_always_inline void
1347 rgb48ToY_c_template(uint8_t *dst, const uint8_t *src, int width,
1348                     enum PixelFormat origin)
1349 {
1350     int i;
1351     for (i = 0; i < width; i++) {
1352         int r_b = input_pixel(&src[i*6+0]) >> 8;
1353         int   g = input_pixel(&src[i*6+2]) >> 8;
1354         int b_r = input_pixel(&src[i*6+4]) >> 8;
1355
1356         dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1357     }
1358 }
1359
1360 static av_always_inline void
1361 rgb48ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1362                     const uint8_t *src1, const uint8_t *src2,
1363                     int width, enum PixelFormat origin)
1364 {
1365     int i;
1366     assert(src1==src2);
1367     for (i = 0; i < width; i++) {
1368         int r_b = input_pixel(&src1[i*6+0]) >> 8;
1369         int   g = input_pixel(&src1[i*6+2]) >> 8;
1370         int b_r = input_pixel(&src1[i*6+4]) >> 8;
1371
1372         dstU[i] = (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1373         dstV[i] = (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1374     }
1375 }
1376
1377 static av_always_inline void
1378 rgb48ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1379                           const uint8_t *src1, const uint8_t *src2,
1380                           int width, enum PixelFormat origin)
1381 {
1382     int i;
1383     assert(src1==src2);
1384     for (i = 0; i < width; i++) {
1385         int r_b = (input_pixel(&src1[12*i + 0]) >> 8) + (input_pixel(&src1[12*i + 6]) >> 8);
1386         int   g = (input_pixel(&src1[12*i + 2]) >> 8) + (input_pixel(&src1[12*i + 8]) >> 8);
1387         int b_r = (input_pixel(&src1[12*i + 4]) >> 8) + (input_pixel(&src1[12*i + 10]) >> 8);
1388
1389         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1390         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT)) >> (RGB2YUV_SHIFT+1);
1391     }
1392 }
1393
/* The 48-bit reader helpers are not needed past this point. */
#undef input_pixel
#undef b
#undef r

/*
 * Generate the concrete 48-bit input readers
 *     <pattern>48<BE_LE>ToY_c, <pattern>48<BE_LE>ToUV_c and
 *     <pattern>48<BE_LE>ToUV_half_c
 * Each one forwards to the matching rgb48To*_c_template with `origin` as
 * the pixel format; the trailing uint32_t pointer argument is ignored.
 */
#define rgb48funcs(pattern, BE_LE, origin) \
static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *dst, \
                                    const uint8_t *src, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToY_c_template(dst, src, width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                                    const uint8_t *src1, \
                                    const uint8_t *src2, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *dstU, \
                                    uint8_t *dstV, \
                                    const uint8_t *src1, \
                                    const uint8_t *src2, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
}
1418
/* 48-bit input readers for both component orders and both byte orders. */
rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1423
/*
 * Fetch pixel number i from `src` for the 16/32-bit readers below; expands
 * in a scope that provides `src` and `origin`.
 *   RGBA / BGRA / ARGB / ABGR : one aligned native-endian 32-bit word;
 *   anything else             : one 16-bit word in the byte order of
 *                               `origin`.
 */
#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? \
                            AV_RN32A(&src[(i)*4]) : \
                            (isBE(origin) ? AV_RB16(&src[(i)*2]) : \
                                            AV_RL16(&src[(i)*2])))
1427
1428 static av_always_inline void
1429 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1430                        int width, enum PixelFormat origin,
1431                        int shr,   int shg,   int shb, int shp,
1432                        int maskr, int maskg, int maskb,
1433                        int rsh,   int gsh,   int bsh, int S)
1434 {
1435     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1436               rnd = 33 << (S - 1);
1437     int i;
1438
1439     for (i = 0; i < width; i++) {
1440         int px = input_pixel(i) >> shp;
1441         int b = (px & maskb) >> shb;
1442         int g = (px & maskg) >> shg;
1443         int r = (px & maskr) >> shr;
1444
1445         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1446     }
1447 }
1448
1449 static av_always_inline void
1450 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1451                         const uint8_t *src, int width,
1452                         enum PixelFormat origin,
1453                         int shr,   int shg,   int shb, int shp,
1454                         int maskr, int maskg, int maskb,
1455                         int rsh,   int gsh,   int bsh, int S)
1456 {
1457     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1458               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1459               rnd = 257 << (S - 1);
1460     int i;
1461
1462     for (i = 0; i < width; i++) {
1463         int px = input_pixel(i) >> shp;
1464         int b = (px & maskb) >> shb;
1465         int g = (px & maskg) >> shg;
1466         int r = (px & maskr) >> shr;
1467
1468         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1469         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1470     }
1471 }
1472
1473 static av_always_inline void
1474 rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
1475                              const uint8_t *src, int width,
1476                              enum PixelFormat origin,
1477                              int shr,   int shg,   int shb, int shp,
1478                              int maskr, int maskg, int maskb,
1479                              int rsh,   int gsh,   int bsh, int S)
1480 {
1481     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1482               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1483               rnd = 257 << S, maskgx = ~(maskr | maskb);
1484     int i;
1485
1486     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1487     for (i = 0; i < width; i++) {
1488         int px0 = input_pixel(2 * i + 0) >> shp;
1489         int px1 = input_pixel(2 * i + 1) >> shp;
1490         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1491         int rb = px0 + px1 - g;
1492
1493         b = (rb & maskb) >> shb;
1494         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1495             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1496             g >>= shg;
1497         } else {
1498             g = (g  & maskg) >> shg;
1499         }
1500         r = (rb & maskr) >> shr;
1501
1502         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
1503         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
1504     }
1505 }
1506
/* The 16/32-bit pixel fetch helper is not needed past this point. */
#undef input_pixel

/*
 * Generate the concrete 16/32-bit input readers
 *     <name>ToY_c, <name>ToUV_c and <name>ToUV_half_c
 * Each one forwards to the matching rgb16_32To*_c_template with the given
 * format, shifts and masks. The second source pointer of the UV readers and
 * the trailing uint32_t pointer argument are ignored.
 */
#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
                         maskg, maskb, rsh, gsh, bsh, S) \
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
                          int width, uint32_t *unused) \
{ \
    rgb16_32ToY_c_template(dst, src, width, fmt, \
                           shr, shg, shb, shp, \
                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                           const uint8_t *src, const uint8_t *dummy, \
                           int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, \
                            shr, shg, shb, shp, \
                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
                                const uint8_t *src, const uint8_t *dummy, \
                                int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, \
                                 shr, shg, shb, shp, \
                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
}
1533
1534 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1535 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1536 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1537 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1538 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1539 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1540 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1541 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1542 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1543 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1544 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1545 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1546
/**
 * Extract the alpha plane from packed 4-byte pixels whose alpha sample is
 * the first byte of every pixel.
 *
 * @param dst    output buffer, receives width bytes
 * @param src    packed input, 4 bytes per pixel
 * @param width  number of pixels to convert
 * @param unused ignored
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x, off;

    (void)unused;
    for (x = 0, off = 0; x < width; x++, off += 4)
        dst[x] = src[off];
}
1554
/**
 * Extract the alpha plane from packed 4-byte pixels whose alpha sample is
 * the last (fourth) byte of every pixel.
 *
 * @param dst    output buffer, receives width bytes
 * @param src    packed input, 4 bytes per pixel
 * @param width  number of pixels to convert
 * @param unused ignored
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x, off;

    (void)unused;
    for (x = 0, off = 3; x < width; x++, off += 4)
        dst[x] = src[off];
}
1562
/**
 * Convert palette indices to luma: each source byte selects a palette
 * entry and the low 8 bits of that entry are stored as the output sample.
 *
 * @param dst   output buffer, receives width bytes
 * @param src   palette indices, one byte per pixel
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1572
/**
 * Convert palette indices to chroma: each source byte selects a palette
 * entry; bits 8..15 of the entry are stored to dstU and bits 16..23 to
 * dstV.
 *
 * @param dstU  output U buffer, receives width bytes
 * @param dstV  output V buffer, receives width bytes
 * @param src1  palette indices, one byte per pixel
 * @param src2  must be identical to src1 (asserted)
 * @param width number of pixels to convert
 * @param pal   palette table indexed by the source byte
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        const int entry = pal[src1[x]];

        /* the stores truncate to 8 bits, selecting one byte of the entry */
        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
        x++;
    }
}
1586
/**
 * Expand 1 bit-per-pixel data to 8-bit samples, inverting every bit:
 * a source bit of 0 becomes 255 and a source bit of 1 becomes 0.
 * Bits are consumed from the most significant bit of each byte downwards.
 *
 * Unlike a plain width/8 loop, the trailing (width % 8) pixels of a row
 * whose width is not a multiple of 8 are converted as well, so the whole
 * output row is always written.
 *
 * @param dst    output buffer, receives width bytes
 * @param src    packed 1bpp input, (width + 7) / 8 bytes are read
 * @param width  number of pixels to convert
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i, j;
    int full_bytes, tail_bits;

    (void)unused;
    if (width <= 0)
        return;

    full_bytes = width / 8;
    tail_bits  = width % 8;

    for (i = 0; i < full_bytes; i++) {
        const unsigned d = src[i] ^ 0xFFu; /* inverted bits of this byte */
        for (j = 0; j < 8; j++)
            dst[8*i + j] = ((d >> (7 - j)) & 1) * 255;
    }

    /* partial last byte: only its top tail_bits bits describe pixels */
    if (tail_bits) {
        const unsigned d = src[full_bytes] ^ 0xFFu;
        for (j = 0; j < tail_bits; j++)
            dst[8*full_bytes + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1597
/**
 * Expand 1 bit-per-pixel data to 8-bit samples: a source bit of 1 becomes
 * 255 and a source bit of 0 becomes 0. Bits are consumed from the most
 * significant bit of each byte downwards.
 *
 * Unlike a plain width/8 loop, the trailing (width % 8) pixels of a row
 * whose width is not a multiple of 8 are converted as well, so the whole
 * output row is always written.
 *
 * @param dst    output buffer, receives width bytes
 * @param src    packed 1bpp input, (width + 7) / 8 bytes are read
 * @param width  number of pixels to convert
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i, j;
    int full_bytes, tail_bits;

    (void)unused;
    if (width <= 0)
        return;

    full_bytes = width / 8;
    tail_bits  = width % 8;

    for (i = 0; i < full_bytes; i++) {
        const unsigned d = src[i];
        for (j = 0; j < 8; j++)
            dst[8*i + j] = ((d >> (7 - j)) & 1) * 255;
    }

    /* partial last byte: only its top tail_bits bits describe pixels */
    if (tail_bits) {
        const unsigned d = src[full_bytes];
        for (j = 0; j < tail_bits; j++)
            dst[8*full_bytes + j] = ((d >> (7 - j)) & 1) * 255;
    }
}
1608
1609 //FIXME yuy2* can read up to 7 samples too much
1610
/**
 * Pick every even-indexed byte of the packed input (bytes 0, 2, 4, ...)
 * and store it as one output sample.
 *
 * @param dst    output buffer, receives width bytes
 * @param src    packed input, 2 bytes per output sample
 * @param width  number of samples to produce
 * @param unused ignored
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x, off;

    (void)unused;
    for (x = 0, off = 0; x < width; x++, off += 2)
        dst[x] = src[off];
}
1618
/**
 * Split the chroma bytes out of 4-byte packed groups: byte 1 of every
 * group goes to dstU and byte 3 goes to dstV.
 *
 * @param dstU   output U buffer, receives width bytes
 * @param dstV   output V buffer, receives width bytes
 * @param src1   packed input, 4 bytes per output sample pair
 * @param src2   must be identical to src1 (asserted after the conversion)
 * @param width  number of chroma samples to produce per plane
 * @param unused ignored
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x, base;

    (void)unused;
    for (x = 0, base = 0; x < width; x++, base += 4) {
        dstU[x] = src1[base + 1];
        dstV[x] = src1[base + 3];
    }
    assert(src1 == src2);
}
1629
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the second
 * byte of every sample (the high byte when samples are little-endian).
 *
 * @param dstU   output U buffer, receives width bytes
 * @param dstV   output V buffer, receives width bytes
 * @param src1   U input, 2 bytes per sample
 * @param src2   V input, 2 bytes per sample
 * @param width  number of samples to convert per plane
 * @param unused ignored
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x, off;

    (void)unused;
    for (x = 0, off = 1; x < width; x++, off += 2) {
        dstU[x] = src1[off];
        dstV[x] = src2[off];
    }
}
1639
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Pick every odd-indexed byte of the packed input (bytes 1, 3, 5, ...)
 * and store it as one output sample.
 *
 * @param dst    output buffer, receives width bytes
 * @param src    packed input, 2 bytes per output sample
 * @param width  number of samples to produce
 * @param unused ignored
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x, off;

    (void)unused;
    for (x = 0, off = 1; x < width; x++, off += 2)
        dst[x] = src[off];
}
1649
/**
 * Split the chroma bytes out of 4-byte packed groups: byte 0 of every
 * group goes to dstU and byte 2 goes to dstV.
 *
 * @param dstU   output U buffer, receives width bytes
 * @param dstV   output V buffer, receives width bytes
 * @param src1   packed input, 4 bytes per output sample pair
 * @param src2   must be identical to src1 (asserted after the conversion)
 * @param width  number of chroma samples to produce per plane
 * @param unused ignored
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x, base;

    (void)unused;
    for (x = 0, base = 0; x < width; x++, base += 4) {
        dstU[x] = src1[base];
        dstV[x] = src1[base + 2];
    }
    assert(src1 == src2);
}
1660
/**
 * Reduce two planes of 2-byte samples to 8 bits by keeping the first
 * byte of every sample (the high byte when samples are big-endian).
 *
 * @param dstU   output U buffer, receives width bytes
 * @param dstV   output V buffer, receives width bytes
 * @param src1   U input, 2 bytes per sample
 * @param src2   V input, 2 bytes per sample
 * @param width  number of samples to convert per plane
 * @param unused ignored
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x, off;

    (void)unused;
    for (x = 0, off = 0; x < width; x++, off += 2) {
        dstU[x] = src1[off];
        dstV[x] = src2[off];
    }
}
1670
1671 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1672                                         const uint8_t *src, int width)
1673 {
1674     int i;
1675     for (i = 0; i < width; i++) {
1676         dst1[i] = src[2*i+0];
1677         dst2[i] = src[2*i+1];
1678     }
1679 }
1680
/**
 * Split interleaved chroma into separate planes: the first byte of every
 * pair goes to dstU and the second to dstV.
 *
 * @param dstU   output U buffer, receives width bytes
 * @param dstV   output V buffer, receives width bytes
 * @param src1   interleaved chroma input, 2 bytes per sample pair
 * @param src2   ignored
 * @param width  number of chroma samples to produce per plane
 * @param unused ignored
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    (void)src2;
    (void)unused;
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1687
/**
 * Split interleaved chroma into separate planes with the byte order
 * swapped relative to nv12ToUV_c: the first byte of every pair goes to
 * dstV and the second to dstU.
 *
 * @param dstU   output U buffer, receives width bytes
 * @param dstV   output V buffer, receives width bytes
 * @param src1   interleaved chroma input, 2 bytes per sample pair
 * @param src2   ignored
 * @param width  number of chroma samples to produce per plane
 * @param unused ignored
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    (void)src2;
    (void)unused;
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1694
1695 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1696
1697 // FIXME Maybe dither instead.
1698 static av_always_inline void
1699 yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1700                           const uint8_t *_srcU, const uint8_t *_srcV,
1701                           int width, enum PixelFormat origin, int depth)
1702 {
1703     int i;
1704     const uint16_t *srcU = (const uint16_t *) _srcU;
1705     const uint16_t *srcV = (const uint16_t *) _srcV;
1706
1707     for (i = 0; i < width; i++) {
1708         dstU[i] = input_pixel(&srcU[i]) >> (depth - 8);
1709         dstV[i] = input_pixel(&srcV[i]) >> (depth - 8);
1710     }
1711 }
1712
1713 static av_always_inline void
1714 yuv9_or_10ToY_c_template(uint8_t *dstY, const uint8_t *_srcY,
1715                          int width, enum PixelFormat origin, int depth)
1716 {
1717     int i;
1718     const uint16_t *srcY = (const uint16_t*)_srcY;
1719
1720     for (i = 0; i < width; i++)
1721         dstY[i] = input_pixel(&srcY[i]) >> (depth - 8);
1722 }
1723
1724 #undef input_pixel
1725
/*
 * Generate the input converters for one high-bit-depth planar format:
 *   <BE_LE><depth>ToUV_c and <BE_LE><depth>ToY_c
 * (for example LE9ToUV_c / LE9ToY_c). Both generated functions forward to
 * the yuv9_OR_10ToUV_c_template() / yuv9_or_10ToY_c_template() helpers with
 * the given pixel format and bit depth; their "unused" parameter is never
 * read.
 */
#define YUV_NBPS(depth, BE_LE, origin) \
static void BE_LE ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                                     const uint8_t *srcU, const uint8_t *srcV, \
                                     int width, uint32_t *unused) \
{ \
    yuv9_OR_10ToUV_c_template(dstU, dstV, srcU, srcV, width, origin, depth); \
} \
static void BE_LE ## depth ## ToY_c(uint8_t *dstY, const uint8_t *srcY, \
                                    int width, uint32_t *unused) \
{ \
    yuv9_or_10ToY_c_template(dstY, srcY, width, origin, depth); \
}

/* 9- and 10-bit variants, little- and big-endian. */
YUV_NBPS( 9, LE, PIX_FMT_YUV420P9LE);
YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
1743
1744 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1745                        int width, uint32_t *unused)
1746 {
1747     int i;
1748     for (i=0; i<width; i++) {
1749         int b= src[i*3+0];
1750         int g= src[i*3+1];
1751         int r= src[i*3+2];
1752
1753         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1754     }
1755 }
1756
1757 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1758                         const uint8_t *src2, int width, uint32_t *unused)
1759 {
1760     int i;
1761     for (i=0; i<width; i++) {
1762         int b= src1[3*i + 0];
1763         int g= src1[3*i + 1];
1764         int r= src1[3*i + 2];
1765
1766         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1767         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1768     }
1769     assert(src1 == src2);
1770 }
1771
1772 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1773                              const uint8_t *src2, int width, uint32_t *unused)
1774 {
1775     int i;
1776     for (i=0; i<width; i++) {
1777         int b= src1[6*i + 0] + src1[6*i + 3];
1778         int g= src1[6*i + 1] + src1[6*i + 4];
1779         int r= src1[6*i + 2] + src1[6*i + 5];
1780
1781         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1782         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1783     }
1784     assert(src1 == src2);
1785 }
1786
1787 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1788                        uint32_t *unused)
1789 {
1790     int i;
1791     for (i=0; i<width; i++) {
1792         int r= src[i*3+0];
1793         int g= src[i*3+1];
1794         int b= src[i*3+2];
1795
1796         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1797     }
1798 }
1799
1800 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1801                         const uint8_t *src2, int width, uint32_t *unused)
1802 {
1803     int i;
1804     assert(src1==src2);
1805     for (i=0; i<width; i++) {
1806         int r= src1[3*i + 0];
1807         int g= src1[3*i + 1];
1808         int b= src1[3*i + 2];
1809
1810         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1811         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1812     }
1813 }
1814
1815 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1816                              const uint8_t *src2, int width, uint32_t *unused)
1817 {
1818     int i;
1819     assert(src1==src2);
1820     for (i=0; i<width; i++) {
1821         int r= src1[6*i + 0] + src1[6*i + 3];
1822         int g= src1[6*i + 1] + src1[6*i + 4];
1823         int b= src1[6*i + 2] + src1[6*i + 5];
1824
1825         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1826         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1827     }
1828 }
1829
1830 // bilinear / bicubic scaling
/**
 * Horizontal FIR scaler. For every output sample i, filterSize source
 * bytes starting at src[filterPos[i]] are multiplied by the coefficients
 * filter[filterSize*i ...] and accumulated; the sum is shifted right by 7
 * and clipped from above to 32767 before being stored.
 *
 * @param dst        output buffer, receives dstW samples
 * @param dstW       number of output samples
 * @param src        8-bit input line
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  first source index used for each output sample
 * @param filterSize number of taps per output sample
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const int first = filterPos[x];
        const int coeff = filterSize * x;
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)src[first + tap]) * filter[coeff + tap];

        /* only the upper bound is clipped: the cubic filter can overflow */
        dst[x] = FFMIN(acc >> 7, (1 << 15) - 1);
    }
}
1848
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place chroma range conversion. Every sample is first clamped from
 * above to 30775, then mapped to (x * 4663 - 9289992) >> 12.
 *
 * @param dstU  U samples, converted in place
 * @param dstV  V samples, converted in place
 * @param width number of samples per plane
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        dstU[x] = (FFMIN(dstU[x], 30775) * 4663 - 9289992) >> 12;
        dstV[x] = (FFMIN(dstV[x], 30775) * 4663 - 9289992) >> 12;
        x++;
    }
}
/**
 * In-place chroma range conversion, the counterpart of chrRangeToJpeg_c.
 * Every sample x is mapped to (x * 1799 + 4081085) >> 11.
 *
 * @param dstU  U samples, converted in place
 * @param dstV  V samples, converted in place
 * @param width number of samples per plane
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        dstU[x] = (dstU[x] * 1799 + 4081085) >> 11;
        dstV[x] = (dstV[x] * 1799 + 4081085) >> 11;
        x++;
    }
}
/**
 * In-place luma range conversion. Every sample is first clamped from
 * above to 30189, then mapped to (x * 19077 - 39057361) >> 14.
 *
 * @param dst   luma samples, converted in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        dst[x] = (FFMIN(dst[x], 30189) * 19077 - 39057361) >> 14;
        x++;
    }
}
/**
 * In-place luma range conversion, the counterpart of lumRangeToJpeg_c.
 * Every sample x is mapped to (x * 14071 + 33561947) >> 14.
 *
 * @param dst   luma samples, converted in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        dst[x] = (dst[x] * 14071 + 33561947) >> 14;
        x++;
    }
}
1879
1880 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1881                            const uint8_t *src, int srcW, int xInc)
1882 {
1883     int i;
1884     unsigned int xpos=0;
1885     for (i=0;i<dstWidth;i++) {
1886         register unsigned int xx=xpos>>16;
1887         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1888         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1889         xpos+=xInc;
1890     }
1891 }
1892
1893 // *** horizontal scale Y line to temp buffer
1894 static av_always_inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1895                                      const uint8_t *src, int srcW, int xInc,
1896                                      const int16_t *hLumFilter,
1897                                      const int16_t *hLumFilterPos, int hLumFilterSize,
1898                                      uint8_t *formatConvBuffer,
1899                                      uint32_t *pal, int isAlpha)
1900 {
1901     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1902     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1903
1904     if (toYV12) {
1905         toYV12(formatConvBuffer, src, srcW, pal);
1906         src= formatConvBuffer;
1907     }
1908
1909     if (!c->hyscale_fast) {
1910         c->hScale(dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
1911     } else { // fast bilinear upscale / crap downscale
1912         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1913     }
1914
1915     if (convertRange)
1916         convertRange(dst, dstWidth);
1917 }
1918
1919 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1920                            int dstWidth, const uint8_t *src1,
1921                            const uint8_t *src2, int srcW, int xInc)
1922 {
1923     int i;
1924     unsigned int xpos=0;
1925     for (i=0;i<dstWidth;i++) {
1926         register unsigned int xx=xpos>>16;
1927         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1928         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1929         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1930         xpos+=xInc;
1931     }
1932 }
1933
1934 static av_always_inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
1935                                      const uint8_t *src1, const uint8_t *src2,
1936                                      int srcW, int xInc, const int16_t *hChrFilter,
1937                                      const int16_t *hChrFilterPos, int hChrFilterSize,
1938                                      uint8_t *formatConvBuffer, uint32_t *pal)
1939 {
1940     if (c->chrToYV12) {
1941         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW, 16);
1942         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
1943         src1= formatConvBuffer;
1944         src2= buf2;
1945     }
1946
1947     if (!c->hcscale_fast) {
1948         c->hScale(dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
1949         c->hScale(dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
1950     } else { // fast bilinear upscale / crap downscale
1951         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
1952     }
1953
1954     if (c->chrConvertRange)
1955         c->chrConvertRange(dst1, dst2, dstWidth);
1956 }
1957
/**
 * Select the plain-C vertical scaler / output functions matching the
 * destination format of the context and store them through the given
 * pointers.
 *
 * Only the pointers relevant for the destination format are written; any
 * output pointer that is not assigned on the path taken keeps the value
 * the caller put there.
 *
 * @param c           scaler context; dstFormat, flags and (for 32-bit RGB)
 *                    alpPixBuf are read
 * @param yuv2yuv1    receives the unscaled planar output function
 *                    (only assigned for the generic 8-bit planar case)
 * @param yuv2yuvX    receives the filtering planar output function
 * @param yuv2packed1 receives the packed output function taking one line
 * @param yuv2packed2 receives the packed output function taking two lines
 * @param yuv2packedX receives the packed output function taking a filter
 */
static av_always_inline void
find_c_packed_planar_out_funcs(SwsContext *c,
                               yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
                               yuv2packedX_fn *yuv2packedX)
{
    enum PixelFormat dstFormat = c->dstFormat;

    /* Planar output: only yuv2yuvX is set for NV12/NV21 and for the
     * 9/10/16-bit formats; yuv2yuv1 is set in the generic case only. */
    if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
        *yuv2yuvX     = yuv2nv12X_c;
    } else if (is16BPS(dstFormat)) {
        *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
    } else if (is9_OR_10BPS(dstFormat)) {
        /* depth_minus1 == 8 means 9 bits per component, otherwise 10 */
        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
        } else {
            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
        }
    } else {
        *yuv2yuv1     = yuv2yuv1_c;
        *yuv2yuvX     = yuv2yuvX_c;
    }
    /* Packed output: with full chroma interpolation a single generic
     * function is used and yuv2packed1/yuv2packed2 are left untouched. */
    if(c->flags & SWS_FULL_CHR_H_INT) {
        *yuv2packedX = yuv2rgbX_c_full;
    } else {
        /* No default case: formats not listed leave all packed pointers
         * unchanged. */
        switch (dstFormat) {
        case PIX_FMT_GRAY16BE:
            *yuv2packed1 = yuv2gray16BE_1_c;
            *yuv2packed2 = yuv2gray16BE_2_c;
            *yuv2packedX = yuv2gray16BE_X_c;
            break;
        case PIX_FMT_GRAY16LE:
            *yuv2packed1 = yuv2gray16LE_1_c;
            *yuv2packed2 = yuv2gray16LE_2_c;
            *yuv2packedX = yuv2gray16LE_X_c;
            break;
        case PIX_FMT_MONOWHITE:
            *yuv2packed1 = yuv2monowhite_1_c;
            *yuv2packed2 = yuv2monowhite_2_c;
            *yuv2packedX = yuv2monowhite_X_c;
            break;
        case PIX_FMT_MONOBLACK:
            *yuv2packed1 = yuv2monoblack_1_c;
            *yuv2packed2 = yuv2monoblack_2_c;
            *yuv2packedX = yuv2monoblack_X_c;
            break;
        case PIX_FMT_YUYV422:
            *yuv2packed1 = yuv2yuyv422_1_c;
            *yuv2packed2 = yuv2yuyv422_2_c;
            *yuv2packedX = yuv2yuyv422_X_c;
            break;
        case PIX_FMT_UYVY422:
            *yuv2packed1 = yuv2uyvy422_1_c;
            *yuv2packed2 = yuv2uyvy422_2_c;
            *yuv2packedX = yuv2uyvy422_X_c;
            break;
        /* NOTE(review): the LE assignments are commented out, so RGB48LE
         * falls through and gets the RGB48BE functions - confirm that this
         * is intended. */
        case PIX_FMT_RGB48LE:
            //*yuv2packed1 = yuv2rgb48le_1_c;
            //*yuv2packed2 = yuv2rgb48le_2_c;
            //*yuv2packedX = yuv2rgb48le_X_c;
            //break;
        case PIX_FMT_RGB48BE:
            *yuv2packed1 = yuv2rgb48be_1_c;
            *yuv2packed2 = yuv2rgb48be_2_c;
            *yuv2packedX = yuv2rgb48be_X_c;
            break;
        /* NOTE(review): same fall-through as above, BGR48LE gets the
         * BGR48BE functions. */
        case PIX_FMT_BGR48LE:
            //*yuv2packed1 = yuv2bgr48le_1_c;
            //*yuv2packed2 = yuv2bgr48le_2_c;
            //*yuv2packedX = yuv2bgr48le_X_c;
            //break;
        case PIX_FMT_BGR48BE:
            *yuv2packed1 = yuv2bgr48be_1_c;
            *yuv2packed2 = yuv2bgr48be_2_c;
            *yuv2packedX = yuv2bgr48be_X_c;
            break;
        /* 32-bit RGB: CONFIG_SMALL builds use one function set; otherwise
         * the alpha-writing set is used when an alpha buffer exists (and
         * alpha support is compiled in), else the "rgbx" set. */
        case PIX_FMT_RGB32:
        case PIX_FMT_BGR32:
#if CONFIG_SMALL
            *yuv2packed1 = yuv2rgb32_1_c;
            *yuv2packed2 = yuv2rgb32_2_c;
            *yuv2packedX = yuv2rgb32_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_c;
                    *yuv2packed2 = yuv2rgba32_2_c;
                    *yuv2packedX = yuv2rgba32_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_c;
                    *yuv2packed2 = yuv2rgbx32_2_c;
                    *yuv2packedX = yuv2rgbx32_X_c;
                }
#endif /* !CONFIG_SMALL */
            break;
        /* Same selection scheme as above for the "_1" 32-bit variants. */
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32_1:
#if CONFIG_SMALL
                *yuv2packed1 = yuv2rgb32_1_1_c;
                *yuv2packed2 = yuv2rgb32_1_2_c;
                *yuv2packedX = yuv2rgb32_1_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_1_c;
                    *yuv2packed2 = yuv2rgba32_1_2_c;
                    *yuv2packedX = yuv2rgba32_1_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_1_c;
                    *yuv2packed2 = yuv2rgbx32_1_2_c;
                    *yuv2packedX = yuv2rgbx32_1_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
        case PIX_FMT_RGB24:
            *yuv2packed1 = yuv2rgb24_1_c;
            *yuv2packed2 = yuv2rgb24_2_c;
            *yuv2packedX = yuv2rgb24_X_c;
            break;
        case PIX_FMT_BGR24:
            *yuv2packed1 = yuv2bgr24_1_c;
            *yuv2packed2 = yuv2bgr24_2_c;
            *yuv2packedX = yuv2bgr24_X_c;
            break;
        /* For the remaining RGB depths the RGB and BGR orderings share the
         * same output functions. */
        case PIX_FMT_RGB565:
        case PIX_FMT_BGR565:
            *yuv2packed1 = yuv2rgb16_1_c;
            *yuv2packed2 = yuv2rgb16_2_c;
            *yuv2packedX = yuv2rgb16_X_c;
            break;
        case PIX_FMT_RGB555:
        case PIX_FMT_BGR555:
            *yuv2packed1 = yuv2rgb15_1_c;
            *yuv2packed2 = yuv2rgb15_2_c;
            *yuv2packedX = yuv2rgb15_X_c;
            break;
        case PIX_FMT_RGB444:
        case PIX_FMT_BGR444:
            *yuv2packed1 = yuv2rgb12_1_c;
            *yuv2packed2 = yuv2rgb12_2_c;
            *yuv2packedX = yuv2rgb12_X_c;
            break;
        case PIX_FMT_RGB8:
        case PIX_FMT_BGR8:
            *yuv2packed1 = yuv2rgb8_1_c;
            *yuv2packed2 = yuv2rgb8_2_c;
            *yuv2packedX = yuv2rgb8_X_c;
            break;
        case PIX_FMT_RGB4:
        case PIX_FMT_BGR4:
            *yuv2packed1 = yuv2rgb4_1_c;
            *yuv2packed2 = yuv2rgb4_2_c;
            *yuv2packedX = yuv2rgb4_X_c;
            break;
        case PIX_FMT_RGB4_BYTE:
        case PIX_FMT_BGR4_BYTE:
            *yuv2packed1 = yuv2rgb4b_1_c;
            *yuv2packed2 = yuv2rgb4b_2_c;
            *yuv2packedX = yuv2rgb4b_X_c;
            break;
        }
    }
}
2125
/* Set DEBUG_SWSCALE_BUFFERS to a non-zero value to log the buffer state
 * through av_log() at AV_LOG_DEBUG level. DEBUG_BUFFERS() expects a
 * variable named "c" to be in scope at the call site.
 * NOTE(review): the macro expands to a bare "if", so using it directly
 * before an "else" would capture that else. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2128
2129 static int swScale(SwsContext *c, const uint8_t* src[],
2130                    int srcStride[], int srcSliceY,
2131                    int srcSliceH, uint8_t* dst[], int dstStride[])
2132 {
2133     /* load a few things into local vars to make the code more readable? and faster */
2134     const int srcW= c->srcW;
2135     const int dstW= c->dstW;
2136     const int dstH= c->dstH;
2137     const int chrDstW= c->chrDstW;
2138     const int chrSrcW= c->chrSrcW;
2139     const int lumXInc= c->lumXInc;
2140     const int chrXInc= c->chrXInc;
2141     const enum PixelFormat dstFormat= c->dstFormat;
2142     const int flags= c->flags;
2143     int16_t *vLumFilterPos= c->vLumFilterPos;
2144     int16_t *vChrFilterPos= c->vChrFilterPos;
2145     int16_t *hLumFilterPos= c->hLumFilterPos;
2146     int16_t *hChrFilterPos= c->hChrFilterPos;
2147     int16_t *vLumFilter= c->vLumFilter;
2148     int16_t *vChrFilter= c->vChrFilter;
2149     int16_t *hLumFilter= c->hLumFilter;
2150     int16_t *hChrFilter= c->hChrFilter;
2151     int32_t *lumMmxFilter= c->lumMmxFilter;
2152     int32_t *chrMmxFilter= c->chrMmxFilter;
2153     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2154     const int vLumFilterSize= c->vLumFilterSize;
2155     const int vChrFilterSize= c->vChrFilterSize;
2156     const int hLumFilterSize= c->hLumFilterSize;
2157     const int hChrFilterSize= c->hChrFilterSize;
2158     int16_t **lumPixBuf= c->lumPixBuf;
2159     int16_t **chrUPixBuf= c->chrUPixBuf;
2160     int16_t **chrVPixBuf= c->chrVPixBuf;
2161     int16_t **alpPixBuf= c->alpPixBuf;
2162     const int vLumBufSize= c->vLumBufSize;
2163     const int vChrBufSize= c->vChrBufSize;
2164     uint8_t *formatConvBuffer= c->formatConvBuffer;
2165     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2166     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2167     int lastDstY;
2168     uint32_t *pal=c->pal_yuv;
2169     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2170     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2171     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2172     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2173     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2174
2175     /* vars which will change and which we need to store back in the context */
2176     int dstY= c->dstY;
2177     int lumBufIndex= c->lumBufIndex;
2178     int chrBufIndex= c->chrBufIndex;
2179     int lastInLumBuf= c->lastInLumBuf;
2180     int lastInChrBuf= c->lastInChrBuf;
2181
2182     if (isPacked(c->srcFormat)) {
2183         src[0]=
2184         src[1]=
2185         src[2]=
2186         src[3]= src[0];
2187         srcStride[0]=
2188         srcStride[1]=
2189         srcStride[2]=
2190         srcStride[3]= srcStride[0];
2191     }
2192     srcStride[1]<<= c->vChrDrop;
2193     srcStride[2]<<= c->vChrDrop;
2194
2195     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2196                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2197                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2198     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2199                    srcSliceY,    srcSliceH,    dstY,    dstH);
2200     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2201                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2202
2203     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2204         static int warnedAlready=0; //FIXME move this into the context perhaps
2205         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2206             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2207                    "         ->cannot do aligned memory accesses anymore\n");
2208             warnedAlready=1;
2209         }
2210     }
2211
2212     /* Note the user might start scaling the picture in the middle so this
2213        will not get executed. This is not really intended but works
2214        currently, so people might do it. */
2215     if (srcSliceY ==0) {
2216         lumBufIndex=-1;
2217         chrBufIndex=-1;
2218         dstY=0;
2219         lastInLumBuf= -1;
2220         lastInChrBuf= -1;
2221     }
2222
2223     lastDstY= dstY;
2224
2225     for (;dstY < dstH; dstY++) {
2226         unsigned char *dest =dst[0]+dstStride[0]*dstY;
2227         const int chrDstY= dstY>>c->chrDstVSubSample;
2228         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2229         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2230         unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2231
2232         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2233         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2234         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2235         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2236         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2237         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2238         int enough_lines;
2239
2240         //handle holes (FAST_BILINEAR & weird filters)
2241         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2242         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2243         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2244         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2245
2246         DEBUG_BUFFERS("dstY: %d\n", dstY);
2247         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2248                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2249         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2250                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2251
2252         // Do we have enough lines in this slice to output the dstY line
2253         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2254
2255         if (!enough_lines) {
2256             lastLumSrcY = srcSliceY + srcSliceH - 1;
2257             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2258             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2259                                             lastLumSrcY, lastChrSrcY);
2260         }
2261
2262         //Do horizontal scaling
2263         while(lastInLumBuf < lastLumSrcY) {
2264             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2265             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2266             lumBufIndex++;
2267             assert(lumBufIndex < 2*vLumBufSize);
2268             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2269             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2270             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2271                     hLumFilter, hLumFilterPos, hLumFilterSize,
2272                     formatConvBuffer,
2273                     pal, 0);
2274             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2275                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2276                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2277                         formatConvBuffer,
2278                         pal, 1);
2279             lastInLumBuf++;
2280             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2281                                lumBufIndex,    lastInLumBuf);
2282         }
2283         while(lastInChrBuf < lastChrSrcY) {
2284             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2285             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2286             chrBufIndex++;
2287             assert(chrBufIndex < 2*vChrBufSize);
2288             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2289             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2290             //FIXME replace parameters through context struct (some at least)
2291
2292             if (c->needs_hcscale)
2293                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2294                           chrDstW, src1, src2, chrSrcW, chrXInc,
2295                           hChrFilter, hChrFilterPos, hChrFilterSize,
2296                           formatConvBuffer, pal);
2297             lastInChrBuf++;
2298             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2299                                chrBufIndex,    lastInChrBuf);
2300         }
2301         //wrap buf index around to stay inside the ring buffer
2302         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2303         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2304         if (!enough_lines)
2305             break; //we can't output a dstY line so let's try with the next slice
2306
2307 #if HAVE_MMX
2308         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2309 #endif
2310         if (dstY >= dstH-2) {
2311             // hmm looks like we can't use MMX here without overwriting this array's tail
2312             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2313                                            &yuv2packed1, &yuv2packed2,
2314                                            &yuv2packedX);
2315         }
2316
2317         {
2318             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2319             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2320             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2321             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2322             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2323                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2324                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2325                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2326                     const int16_t *lumBuf = lumSrcPtr[0];
2327                     const int16_t *chrUBuf= chrUSrcPtr[0];
2328                     const int16_t *chrVBuf= chrVSrcPtr[0];
2329                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2330                     yuv2yuv1(c, lumBuf, chrUBuf, chrVBuf, alpBuf, dest,
2331                                 uDest, vDest, aDest, dstW, chrDstW);
2332                 } else { //General YV12
2333                     yuv2yuvX(c,
2334                                 vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2335                                 vChrFilter+chrDstY*vChrFilterSize, chrUSrcPtr,
2336                                 chrVSrcPtr, vChrFilterSize,
2337                                 alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2338                 }
2339             } else {
2340                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2341                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2342                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2343                     int chrAlpha= vChrFilter[2*dstY+1];
2344                     yuv2packed1(c, *lumSrcPtr, *chrUSrcPtr, *(chrUSrcPtr+1),
2345                                    *chrVSrcPtr, *(chrVSrcPtr+1),
2346                                    alpPixBuf ? *alpSrcPtr : NULL,
2347                                    dest, dstW, chrAlpha, dstFormat, flags, dstY);
2348                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2349                     int lumAlpha= vLumFilter[2*dstY+1];
2350                     int chrAlpha= vChrFilter[2*dstY+1];
2351                     lumMmxFilter[2]=
2352                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
2353                     chrMmxFilter[2]=
2354                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2355                     yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrUSrcPtr, *(chrUSrcPtr+1),
2356                                    *chrVSrcPtr, *(chrVSrcPtr+1),
2357                                    alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2358                                    dest, dstW, lumAlpha, chrAlpha, dstY);
2359                 } else { //general RGB
2360                     yuv2packedX(c,
2361                                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2362                                    vChrFilter+dstY*vChrFilterSize, chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2363                                    alpSrcPtr, dest, dstW, dstY);
2364                 }
2365             }
2366         }
2367     }
2368
2369     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2370         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2371
2372 #if HAVE_MMX2
2373     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2374         __asm__ volatile("sfence":::"memory");
2375 #endif
2376     emms_c();
2377
2378     /* store changed local vars back in the context */
2379     c->dstY= dstY;
2380     c->lumBufIndex= lumBufIndex;
2381     c->chrBufIndex= chrBufIndex;
2382     c->lastInLumBuf= lastInLumBuf;
2383     c->lastInChrBuf= lastInChrBuf;
2384
2385     return dstY - lastDstY;
2386 }
2387
2388 static av_cold void sws_init_swScale_c(SwsContext *c)
2389 {
2390     enum PixelFormat srcFormat = c->srcFormat;
2391
2392     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2393                                    &c->yuv2packed1, &c->yuv2packed2,
2394                                    &c->yuv2packedX);
2395
2396     c->hScale       = hScale_c;
2397
2398     if (c->flags & SWS_FAST_BILINEAR) {
2399         c->hyscale_fast = hyscale_fast_c;
2400         c->hcscale_fast = hcscale_fast_c;
2401     }
2402
2403     c->chrToYV12 = NULL;
2404     switch(srcFormat) {
2405         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2406         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2407         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2408         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2409         case PIX_FMT_RGB8     :
2410         case PIX_FMT_BGR8     :
2411         case PIX_FMT_PAL8     :
2412         case PIX_FMT_BGR4_BYTE:
2413         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2414         case PIX_FMT_YUV444P9BE:
2415         case PIX_FMT_YUV420P9BE: c->chrToYV12 = BE9ToUV_c; break;
2416         case PIX_FMT_YUV444P9LE:
2417         case PIX_FMT_YUV420P9LE: c->chrToYV12 = LE9ToUV_c; break;
2418         case PIX_FMT_YUV444P10BE:
2419         case PIX_FMT_YUV422P10BE:
2420         case PIX_FMT_YUV420P10BE: c->chrToYV12 = BE10ToUV_c; break;
2421         case PIX_FMT_YUV422P10LE:
2422         case PIX_FMT_YUV444P10LE:
2423         case PIX_FMT_YUV420P10LE: c->chrToYV12 = LE10ToUV_c; break;
2424         case PIX_FMT_YUV420P16BE:
2425         case PIX_FMT_YUV422P16BE:
2426         case PIX_FMT_YUV444P16BE: c->chrToYV12 = BEToUV_c; break;
2427         case PIX_FMT_YUV420P16LE:
2428         case PIX_FMT_YUV422P16LE:
2429         case PIX_FMT_YUV444P16LE: c->chrToYV12 = LEToUV_c; break;
2430     }
2431     if (c->chrSrcHSubSample) {
2432         switch(srcFormat) {
2433         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2434         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2435         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2436         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2437         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2438         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2439         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2440         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2441         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2442         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2443         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2444         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2445         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2446         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2447         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2448         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2449         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2450         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2451         }
2452     } else {
2453         switch(srcFormat) {
2454         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2455         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2456         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2457         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2458         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2459         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2460         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2461         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2462         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2463         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2464         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2465         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2466         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2467         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2468         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2469         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2470         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2471         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2472         }
2473     }
2474
2475     c->lumToYV12 = NULL;
2476     c->alpToYV12 = NULL;
2477     switch (srcFormat) {
2478     case PIX_FMT_YUV444P9BE:
2479     case PIX_FMT_YUV420P9BE: c->lumToYV12 = BE9ToY_c; break;
2480     case PIX_FMT_YUV444P9LE:
2481     case PIX_FMT_YUV420P9LE: c->lumToYV12 = LE9ToY_c; break;
2482     case PIX_FMT_YUV444P10BE:
2483     case PIX_FMT_YUV422P10BE:
2484     case PIX_FMT_YUV420P10BE: c->lumToYV12 = BE10ToY_c; break;
2485     case PIX_FMT_YUV444P10LE:
2486     case PIX_FMT_YUV422P10LE:
2487     case PIX_FMT_YUV420P10LE: c->lumToYV12 = LE10ToY_c; break;
2488     case PIX_FMT_YUYV422  :
2489     case PIX_FMT_YUV420P16BE:
2490     case PIX_FMT_YUV422P16BE:
2491     case PIX_FMT_YUV444P16BE:
2492     case PIX_FMT_Y400A    :
2493     case PIX_FMT_GRAY16BE : c->lumToYV12 = yuy2ToY_c; break;
2494     case PIX_FMT_UYVY422  :
2495     case PIX_FMT_YUV420P16LE:
2496     case PIX_FMT_YUV422P16LE:
2497     case PIX_FMT_YUV444P16LE:
2498     case PIX_FMT_GRAY16LE : c->lumToYV12 = uyvyToY_c;    break;
2499     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2500     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2501     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2502     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2503     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2504     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2505     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2506     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2507     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2508     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2509     case PIX_FMT_RGB8     :
2510     case PIX_FMT_BGR8     :
2511     case PIX_FMT_PAL8     :
2512     case PIX_FMT_BGR4_BYTE:
2513     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2514     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2515     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2516     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2517     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2518     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2519     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2520     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2521     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2522     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2523     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2524     }
2525     if (c->alpPixBuf) {
2526         switch (srcFormat) {
2527         case PIX_FMT_BGRA:
2528         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2529         case PIX_FMT_ABGR:
2530         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2531         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2532         }
2533     }
2534
2535     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2536         if (c->srcRange) {
2537             c->lumConvertRange = lumRangeFromJpeg_c;
2538             c->chrConvertRange = chrRangeFromJpeg_c;
2539         } else {
2540             c->lumConvertRange = lumRangeToJpeg_c;
2541             c->chrConvertRange = chrRangeToJpeg_c;
2542         }
2543     }
2544
2545     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2546           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2547         c->needs_hcscale = 1;
2548 }
2549
2550 SwsFunc ff_getSwsFunc(SwsContext *c)
2551 {
2552     sws_init_swScale_c(c);
2553
2554     if (HAVE_MMX)
2555         ff_sws_init_swScale_mmx(c);
2556     if (HAVE_ALTIVEC)
2557         ff_sws_init_swScale_altivec(c);
2558
2559     return swScale;
2560 }