git.sesse.net Git - ffmpeg/blob - libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include <inttypes.h>
  22 #include <string.h>
  23 #include <math.h>
  24 #include <stdio.h>
  25 #include "config.h"
  26 #include <assert.h>
  27 #include "swscale.h"
  28 #include "swscale_internal.h"
  29 #include "rgb2rgb.h"
  30 #include "libavutil/avassert.h"
  31 #include "libavutil/intreadwrite.h"
  32 #include "libavutil/cpu.h"
  33 #include "libavutil/avutil.h"
  34 #include "libavutil/mathematics.h"
  35 #include "libavutil/bswap.h"
  36 #include "libavutil/pixdesc.h"
  37
/* 8x8 table of byte dither values (all even, in the range 0..126).
 * swScale() selects one 8-entry row per output line, indexed by (dstY & 7)
 * for luma and (chrDstY & 7) for chroma, when dithering is enabled. */
DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
{  36, 68, 60, 92, 34, 66, 58, 90,},
{ 100,  4,124, 28, 98,  2,122, 26,},
{  52, 84, 44, 76, 50, 82, 42, 74,},
{ 116, 20,108, 12,114, 18,106, 10,},
{  32, 64, 56, 88, 38, 70, 62, 94,},
{  96,  0,120, 24,102,  6,126, 30,},
{  48, 80, 40, 72, 54, 86, 46, 78,},
{ 112, 16,104,  8,118, 22,110, 14,},
};
/* Eight bytes, each 64: a constant row that swScale() installs as both the
 * luma and chroma dither row when should_dither is false. */
DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
{  64, 64, 64, 64, 64, 64, 64, 64 };
  50
  51
  52 static av_always_inline void fillPlane(uint8_t* plane, int stride,
  53                                        int width, int height,
  54                                        int y, uint8_t val)
  55 {
  56     int i;
  57     uint8_t *ptr = plane + stride*y;
  58     for (i=0; i<height; i++) {
  59         memset(ptr, val, width);
  60         ptr += stride;
  61     }
  62 }
  63
  64 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
  65                            const int16_t *filter,
  66                            const int16_t *filterPos, int filterSize)
  67 {
  68     int i;
  69     int32_t *dst = (int32_t *) _dst;
  70     const uint16_t *src = (const uint16_t *) _src;
  71     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
  72     int sh = bits - 4;
  73
  74     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
  75         sh= 9;
  76
  77     for (i = 0; i < dstW; i++) {
  78         int j;
  79         int srcPos = filterPos[i];
  80         int val = 0;
  81
  82         for (j = 0; j < filterSize; j++) {
  83             val += src[srcPos + j] * filter[filterSize * i + j];
  84         }
  85         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
  86         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
  87     }
  88 }
  89
  90 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
  91                            const int16_t *filter,
  92                            const int16_t *filterPos, int filterSize)
  93 {
  94     int i;
  95     const uint16_t *src = (const uint16_t *) _src;
  96     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
  97
  98     if(sh<15)
  99         sh= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
 100
 101     for (i = 0; i < dstW; i++) {
 102         int j;
 103         int srcPos = filterPos[i];
 104         int val = 0;
 105
 106         for (j = 0; j < filterSize; j++) {
 107             val += src[srcPos + j] * filter[filterSize * i + j];
 108         }
 109         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
 110         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
 111     }
 112 }
 113
 114 // bilinear / bicubic scaling
 115 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
 116                           const int16_t *filter, const int16_t *filterPos,
 117                           int filterSize)
 118 {
 119     int i;
 120     for (i=0; i<dstW; i++) {
 121         int j;
 122         int srcPos= filterPos[i];
 123         int val=0;
 124         for (j=0; j<filterSize; j++) {
 125             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
 126         }
 127         //filter += hFilterSize;
 128         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
 129         //dst[i] = val>>7;
 130     }
 131 }
 132
 133 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
 134                           const int16_t *filter, const int16_t *filterPos,
 135                           int filterSize)
 136 {
 137     int i;
 138     int32_t *dst = (int32_t *) _dst;
 139     for (i=0; i<dstW; i++) {
 140         int j;
 141         int srcPos= filterPos[i];
 142         int val=0;
 143         for (j=0; j<filterSize; j++) {
 144             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
 145         }
 146         //filter += hFilterSize;
 147         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
 148         //dst[i] = val>>7;
 149     }
 150 }
 151
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
 154 static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
 155 {
 156     int i;
 157     for (i = 0; i < width; i++) {
 158         dstU[i] = (FFMIN(dstU[i],30775)*4663 - 9289992)>>12; //-264
 159         dstV[i] = (FFMIN(dstV[i],30775)*4663 - 9289992)>>12; //-264
 160     }
 161 }
 162 static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
 163 {
 164     int i;
 165     for (i = 0; i < width; i++) {
 166         dstU[i] = (dstU[i]*1799 + 4081085)>>11; //1469
 167         dstV[i] = (dstV[i]*1799 + 4081085)>>11; //1469
 168     }
 169 }
 170 static void lumRangeToJpeg_c(int16_t *dst, int width)
 171 {
 172     int i;
 173     for (i = 0; i < width; i++)
 174         dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
 175 }
 176 static void lumRangeFromJpeg_c(int16_t *dst, int width)
 177 {
 178     int i;
 179     for (i = 0; i < width; i++)
 180         dst[i] = (dst[i]*14071 + 33561947)>>14;
 181 }
 182
 183 static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
 184 {
 185     int i;
 186     int32_t *dstU = (int32_t *) _dstU;
 187     int32_t *dstV = (int32_t *) _dstV;
 188     for (i = 0; i < width; i++) {
 189         dstU[i] = (FFMIN(dstU[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
 190         dstV[i] = (FFMIN(dstV[i],30775<<4)*4663 - (9289992<<4))>>12; //-264
 191     }
 192 }
 193 static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
 194 {
 195     int i;
 196     int32_t *dstU = (int32_t *) _dstU;
 197     int32_t *dstV = (int32_t *) _dstV;
 198     for (i = 0; i < width; i++) {
 199         dstU[i] = (dstU[i]*1799 + (4081085<<4))>>11; //1469
 200         dstV[i] = (dstV[i]*1799 + (4081085<<4))>>11; //1469
 201     }
 202 }
 203 static void lumRangeToJpeg16_c(int16_t *_dst, int width)
 204 {
 205     int i;
 206     int32_t *dst = (int32_t *) _dst;
 207     for (i = 0; i < width; i++)
 208         dst[i] = (FFMIN(dst[i],30189<<4)*4769 - (39057361<<2))>>12;
 209 }
 210 static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
 211 {
 212     int i;
 213     int32_t *dst = (int32_t *) _dst;
 214     for (i = 0; i < width; i++)
 215         dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
 216 }
 217
 218 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
 219                            const uint8_t *src, int srcW, int xInc)
 220 {
 221     int i;
 222     unsigned int xpos=0;
 223     for (i=0;i<dstWidth;i++) {
 224         register unsigned int xx=xpos>>16;
 225         register unsigned int xalpha=(xpos&0xFFFF)>>9;
 226         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
 227         xpos+=xInc;
 228     }
 229     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
 230         dst[i] = src[srcW-1]*128;
 231 }
 232
 233 // *** horizontal scale Y line to temp buffer
 234 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
 235                                      const uint8_t *src_in[4], int srcW, int xInc,
 236                                      const int16_t *hLumFilter,
 237                                      const int16_t *hLumFilterPos, int hLumFilterSize,
 238                                      uint8_t *formatConvBuffer,
 239                                      uint32_t *pal, int isAlpha)
 240 {
 241     void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
 242     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
 243     const uint8_t *src = src_in[isAlpha ? 3 : 0];
 244
 245     if (toYV12) {
 246         toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
 247         src= formatConvBuffer;
 248     } else if (c->readLumPlanar && !isAlpha) {
 249         c->readLumPlanar(formatConvBuffer, src_in, srcW);
 250         src = formatConvBuffer;
 251     }
 252
 253     if (!c->hyscale_fast) {
 254         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
 255     } else { // fast bilinear upscale / crap downscale
 256         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
 257     }
 258
 259     if (convertRange)
 260         convertRange(dst, dstWidth);
 261 }
 262
 263 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
 264                            int dstWidth, const uint8_t *src1,
 265                            const uint8_t *src2, int srcW, int xInc)
 266 {
 267     int i;
 268     unsigned int xpos=0;
 269     for (i=0;i<dstWidth;i++) {
 270         register unsigned int xx=xpos>>16;
 271         register unsigned int xalpha=(xpos&0xFFFF)>>9;
 272         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
 273         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
 274         xpos+=xInc;
 275     }
 276     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
 277         dst1[i] = src1[srcW-1]*128;
 278         dst2[i] = src2[srcW-1]*128;
 279     }
 280 }
 281
 282 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
 283                                      const uint8_t *src_in[4],
 284                                      int srcW, int xInc, const int16_t *hChrFilter,
 285                                      const int16_t *hChrFilterPos, int hChrFilterSize,
 286                                      uint8_t *formatConvBuffer, uint32_t *pal)
 287 {
 288     const uint8_t *src1 = src_in[1], *src2 = src_in[2];
 289     if (c->chrToYV12) {
 290         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
 291         c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
 292         src1= formatConvBuffer;
 293         src2= buf2;
 294     } else if (c->readChrPlanar) {
 295         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
 296         c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW);
 297         src1= formatConvBuffer;
 298         src2= buf2;
 299     }
 300
 301     if (!c->hcscale_fast) {
 302         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
 303         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
 304     } else { // fast bilinear upscale / crap downscale
 305         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
 306     }
 307
 308     if (c->chrConvertRange)
 309         c->chrConvertRange(dst1, dst2, dstWidth);
 310 }
 311
 312 #define DEBUG_SWSCALE_BUFFERS 0
 313 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
 314
 315 static int swScale(SwsContext *c, const uint8_t* src[],
 316                    int srcStride[], int srcSliceY,
 317                    int srcSliceH, uint8_t* dst[], int dstStride[])
 318 {
 319     /* load a few things into local vars to make the code more readable? and faster */
 320     const int srcW= c->srcW;
 321     const int dstW= c->dstW;
 322     const int dstH= c->dstH;
 323     const int chrDstW= c->chrDstW;
 324     const int chrSrcW= c->chrSrcW;
 325     const int lumXInc= c->lumXInc;
 326     const int chrXInc= c->chrXInc;
 327     const enum PixelFormat dstFormat= c->dstFormat;
 328     const int flags= c->flags;
 329     int16_t *vLumFilterPos= c->vLumFilterPos;
 330     int16_t *vChrFilterPos= c->vChrFilterPos;
 331     int16_t *hLumFilterPos= c->hLumFilterPos;
 332     int16_t *hChrFilterPos= c->hChrFilterPos;
 333     int16_t *hLumFilter= c->hLumFilter;
 334     int16_t *hChrFilter= c->hChrFilter;
 335     int32_t *lumMmxFilter= c->lumMmxFilter;
 336     int32_t *chrMmxFilter= c->chrMmxFilter;
 337     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
 338     const int vLumFilterSize= c->vLumFilterSize;
 339     const int vChrFilterSize= c->vChrFilterSize;
 340     const int hLumFilterSize= c->hLumFilterSize;
 341     const int hChrFilterSize= c->hChrFilterSize;
 342     int16_t **lumPixBuf= c->lumPixBuf;
 343     int16_t **chrUPixBuf= c->chrUPixBuf;
 344     int16_t **chrVPixBuf= c->chrVPixBuf;
 345     int16_t **alpPixBuf= c->alpPixBuf;
 346     const int vLumBufSize= c->vLumBufSize;
 347     const int vChrBufSize= c->vChrBufSize;
 348     uint8_t *formatConvBuffer= c->formatConvBuffer;
 349     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
 350     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
 351     int lastDstY;
 352     uint32_t *pal=c->pal_yuv;
 353     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
 354
 355     yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
 356     yuv2planarX_fn yuv2planeX = c->yuv2planeX;
 357     yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
 358     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
 359     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
 360     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
 361
 362     /* vars which will change and which we need to store back in the context */
 363     int dstY= c->dstY;
 364     int lumBufIndex= c->lumBufIndex;
 365     int chrBufIndex= c->chrBufIndex;
 366     int lastInLumBuf= c->lastInLumBuf;
 367     int lastInChrBuf= c->lastInChrBuf;
 368
 369     if (isPacked(c->srcFormat)) {
 370         src[0]=
 371         src[1]=
 372         src[2]=
 373         src[3]= src[0];
 374         srcStride[0]=
 375         srcStride[1]=
 376         srcStride[2]=
 377         srcStride[3]= srcStride[0];
 378     }
 379     srcStride[1]<<= c->vChrDrop;
 380     srcStride[2]<<= c->vChrDrop;
 381
 382     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
 383                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
 384                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
 385     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
 386                    srcSliceY,    srcSliceH,    dstY,    dstH);
 387     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
 388                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
 389
 390     if (dstStride[0]%16 !=0 || dstStride[1]%16 !=0 || dstStride[2]%16 !=0 || dstStride[3]%16 != 0) {
 391         static int warnedAlready=0; //FIXME move this into the context perhaps
 392         if (flags & SWS_PRINT_INFO && !warnedAlready) {
 393             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
 394                    "         ->cannot do aligned memory accesses anymore\n");
 395             warnedAlready=1;
 396         }
 397     }
 398
 399     if ((int)dst[0]%16 || (int)dst[1]%16 || (int)dst[2]%16 || (int)src[0]%16 || (int)src[1]%16 || (int)src[2]%16
 400         || dstStride[0]%16 || dstStride[1]%16 || dstStride[2]%16 || dstStride[3]%16
 401         || srcStride[0]%16 || srcStride[1]%16 || srcStride[2]%16 || srcStride[3]%16
 402     ) {
 403         static int warnedAlready=0;
 404         int cpu_flags = av_get_cpu_flags();
 405         if (HAVE_MMX2 && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
 406             av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
 407             warnedAlready=1;
 408         }
 409     }
 410
 411     /* Note the user might start scaling the picture in the middle so this
 412        will not get executed. This is not really intended but works
 413        currently, so people might do it. */
 414     if (srcSliceY ==0) {
 415         lumBufIndex=-1;
 416         chrBufIndex=-1;
 417         dstY=0;
 418         lastInLumBuf= -1;
 419         lastInChrBuf= -1;
 420     }
 421
 422     if (!should_dither) {
 423         c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
 424     }
 425     lastDstY= dstY;
 426
 427     for (;dstY < dstH; dstY++) {
 428         const int chrDstY= dstY>>c->chrDstVSubSample;
 429         uint8_t *dest[4] = {
 430             dst[0] + dstStride[0] * dstY,
 431             dst[1] + dstStride[1] * chrDstY,
 432             dst[2] + dstStride[2] * chrDstY,
 433             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
 434         };
 435         int use_mmx_vfilter= c->use_mmx_vfilter;
 436
 437         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
 438         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
 439         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
 440         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
 441         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
 442         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
 443         int enough_lines;
 444
 445         //handle holes (FAST_BILINEAR & weird filters)
 446         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
 447         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
 448         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
 449         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
 450
 451         DEBUG_BUFFERS("dstY: %d\n", dstY);
 452         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
 453                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
 454         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
 455                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
 456
 457         // Do we have enough lines in this slice to output the dstY line
 458         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
 459
 460         if (!enough_lines) {
 461             lastLumSrcY = srcSliceY + srcSliceH - 1;
 462             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
 463             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
 464                                             lastLumSrcY, lastChrSrcY);
 465         }
 466
 467         //Do horizontal scaling
 468         while(lastInLumBuf < lastLumSrcY) {
 469             const uint8_t *src1[4] = {
 470                 src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
 471                 src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
 472                 src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
 473                 src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
 474             };
 475             lumBufIndex++;
 476             assert(lumBufIndex < 2*vLumBufSize);
 477             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
 478             assert(lastInLumBuf + 1 - srcSliceY >= 0);
 479             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
 480                     hLumFilter, hLumFilterPos, hLumFilterSize,
 481                     formatConvBuffer,
 482                     pal, 0);
 483             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
 484                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src1, srcW,
 485                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
 486                         formatConvBuffer,
 487                         pal, 1);
 488             lastInLumBuf++;
 489             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
 490                                lumBufIndex,    lastInLumBuf);
 491         }
 492         while(lastInChrBuf < lastChrSrcY) {
 493             const uint8_t *src1[4] = {
 494                 src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
 495                 src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
 496                 src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
 497                 src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
 498             };
 499             chrBufIndex++;
 500             assert(chrBufIndex < 2*vChrBufSize);
 501             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
 502             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
 503             //FIXME replace parameters through context struct (some at least)
 504
 505             if (c->needs_hcscale)
 506                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
 507                           chrDstW, src1, chrSrcW, chrXInc,
 508                           hChrFilter, hChrFilterPos, hChrFilterSize,
 509                           formatConvBuffer, pal);
 510             lastInChrBuf++;
 511             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
 512                                chrBufIndex,    lastInChrBuf);
 513         }
 514         //wrap buf index around to stay inside the ring buffer
 515         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
 516         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
 517         if (!enough_lines)
 518             break; //we can't output a dstY line so let's try with the next slice
 519
 520 #if HAVE_MMX
 521         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
 522 #endif
 523         if (should_dither) {
 524             c->chrDither8 = dither_8x8_128[chrDstY & 7];
 525             c->lumDither8 = dither_8x8_128[dstY & 7];
 526         }
 527         if (dstY >= dstH-2) {
 528             // hmm looks like we can't use MMX here without overwriting this array's tail
 529             ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX,  &yuv2nv12cX,
 530                                      &yuv2packed1, &yuv2packed2, &yuv2packedX);
 531             use_mmx_vfilter= 0;
 532         }
 533
 534         {
 535             const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
 536             const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
 537             const int16_t **chrVSrcPtr= (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
 538             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
 539             int16_t *vLumFilter= c->vLumFilter;
 540             int16_t *vChrFilter= c->vChrFilter;
 541
 542             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
 543                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
 544
 545                 vLumFilter +=    dstY * vLumFilterSize;
 546                 vChrFilter += chrDstY * vChrFilterSize;
 547
 548 //                 av_assert0(use_mmx_vfilter != (
 549 //                                yuv2planeX == yuv2planeX_10BE_c
 550 //                             || yuv2planeX == yuv2planeX_10LE_c
 551 //                             || yuv2planeX == yuv2planeX_9BE_c
 552 //                             || yuv2planeX == yuv2planeX_9LE_c
 553 //                             || yuv2planeX == yuv2planeX_16BE_c
 554 //                             || yuv2planeX == yuv2planeX_16LE_c
 555 //                             || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
 556
 557                 if(use_mmx_vfilter){
 558                     vLumFilter= c->lumMmxFilter;
 559                     vChrFilter= c->chrMmxFilter;
 560                 }
 561
 562                 if (vLumFilterSize == 1) {
 563                     yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
 564                 } else {
 565                     yuv2planeX(vLumFilter, vLumFilterSize,
 566                                lumSrcPtr, dest[0], dstW, c->lumDither8, 0);
 567                 }
 568
 569                 if (!((dstY&chrSkipMask) || isGray(dstFormat))) {
 570                     if (yuv2nv12cX) {
 571                         yuv2nv12cX(c, vChrFilter, vChrFilterSize, chrUSrcPtr, chrVSrcPtr, dest[1], chrDstW);
 572                     } else if (vChrFilterSize == 1) {
 573                         yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
 574                         yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
 575                     } else {
 576                         yuv2planeX(vChrFilter, vChrFilterSize,
 577                                    chrUSrcPtr, dest[1], chrDstW, c->chrDither8, 0);
 578                         yuv2planeX(vChrFilter, vChrFilterSize,
 579                                    chrVSrcPtr, dest[2], chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
 580                     }
 581                 }
 582
 583                 if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
 584                     if(use_mmx_vfilter){
 585                         vLumFilter= c->alpMmxFilter;
 586                     }
 587                     if (vLumFilterSize == 1) {
 588                         yuv2plane1(alpSrcPtr[0], dest[3], dstW, c->lumDither8, 0);
 589                     } else {
 590                         yuv2planeX(vLumFilter, vLumFilterSize,
 591                                    alpSrcPtr, dest[3], dstW, c->lumDither8, 0);
 592                     }
 593                 }
 594             } else {
 595                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
 596                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
 597                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
 598                     int chrAlpha = vChrFilter[2 * dstY + 1];
 599                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
 600                                 alpPixBuf ? *alpSrcPtr : NULL,
 601                                 dest[0], dstW, chrAlpha, dstY);
 602                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
 603                     int lumAlpha = vLumFilter[2 * dstY + 1];
 604                     int chrAlpha = vChrFilter[2 * dstY + 1];
 605                     lumMmxFilter[2] =
 606                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
 607                     chrMmxFilter[2] =
 608                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
 609                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
 610                                 alpPixBuf ? alpSrcPtr : NULL,
 611                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
 612                 } else { //general RGB
 613                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
 614                                 lumSrcPtr, vLumFilterSize,
 615                                 vChrFilter + dstY * vChrFilterSize,
 616                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
 617                                 alpSrcPtr, dest[0], dstW, dstY);
 618                 }
 619             }
 620         }
 621     }
 622
 623     if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf)
 624         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
 625
 626 #if HAVE_MMX2
 627     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
 628         __asm__ volatile("sfence":::"memory");
 629 #endif
 630     emms_c();
 631
 632     /* store changed local vars back in the context */
 633     c->dstY= dstY;
 634     c->lumBufIndex= lumBufIndex;
 635     c->chrBufIndex= chrBufIndex;
 636     c->lastInLumBuf= lastInLumBuf;
 637     c->lastInChrBuf= lastInChrBuf;
 638
 639     return dstY - lastDstY;
 640 }
 641
 642 static av_cold void sws_init_swScale_c(SwsContext *c)
 643 {
 644     enum PixelFormat srcFormat = c->srcFormat;
 645
 646     ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
 647                              &c->yuv2nv12cX, &c->yuv2packed1,
 648                              &c->yuv2packed2, &c->yuv2packedX);
 649
 650     ff_sws_init_input_funcs(c);
 651
 652
 653     if (c->srcBpc == 8) {
 654         if (c->dstBpc <= 10) {
 655             c->hyScale = c->hcScale = hScale8To15_c;
 656             if (c->flags & SWS_FAST_BILINEAR) {
 657                 c->hyscale_fast = hyscale_fast_c;
 658                 c->hcscale_fast = hcscale_fast_c;
 659             }
 660         } else {
 661             c->hyScale = c->hcScale = hScale8To19_c;
 662         }
 663     } else {
 664         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
 665     }
 666
 667     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
 668         if (c->dstBpc <= 10) {
 669             if (c->srcRange) {
 670                 c->lumConvertRange = lumRangeFromJpeg_c;
 671                 c->chrConvertRange = chrRangeFromJpeg_c;
 672             } else {
 673                 c->lumConvertRange = lumRangeToJpeg_c;
 674                 c->chrConvertRange = chrRangeToJpeg_c;
 675             }
 676         } else {
 677             if (c->srcRange) {
 678                 c->lumConvertRange = lumRangeFromJpeg16_c;
 679                 c->chrConvertRange = chrRangeFromJpeg16_c;
 680             } else {
 681                 c->lumConvertRange = lumRangeToJpeg16_c;
 682                 c->chrConvertRange = chrRangeToJpeg16_c;
 683             }
 684         }
 685     }
 686
 687     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
 688           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
 689         c->needs_hcscale = 1;
 690 }
 691
 692 SwsFunc ff_getSwsFunc(SwsContext *c)
 693 {
 694     sws_init_swScale_c(c);
 695
 696     if (HAVE_MMX)
 697         ff_sws_init_swScale_mmx(c);
 698     if (HAVE_ALTIVEC)
 699         ff_sws_init_swScale_altivec(c);
 700
 701     return swScale;
 702 }