swscale: clip unscaled colorspace conversion path.

[ffmpeg] / libswscale / swscale.c
diff --git a/libswscale/swscale.c b/libswscale/swscale.c

index 14807fdc4686878ae15d5f1c0dff24dc68575c33..dd9f4a108f3904c31c8994624234c3f64a55ecab 100644 (file)
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -182,6 +182,18 @@ DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
  { 77,  23,  60,  15,  72,  21,  56,  14, },
  };
  #endif
+DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = { /* 8x8 dither offsets: every even value in 0..126 appears exactly once */
+{  36, 68, 60, 92, 34, 66, 58, 90,}, /* row choice: swScale() uses dither_8x8_128[dstY & 7] for luma and [chrDstY & 7] for chroma when should_dither is set */
+{ 100,  4,124, 28, 98,  2,122, 26,}, /* column choice: output loops index the row with (i & 7); the V plane uses ((i + 3) & 7) */
+{  52, 84, 44, 76, 50, 82, 42, 74,}, /* yuv2yuv1_c adds the entry to the sample before its >> 7 */
+{ 116, 20,108, 12,114, 18,106, 10,}, /* yuv2yuvX_c and yuv2nv12X_c seed their accumulators with (entry << 12) */
+{  32, 64, 56, 88, 38, 70, 62, 94,},
+{  96,  0,120, 24,102,  6,126, 30,},
+{  48, 80, 40, 72, 54, 86, 46, 78,},
+{ 112, 16,104,  8,118, 22,110, 14,},
+}; /* end of dither_8x8_128 */
+DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] = /* constant row; swScale() installs it as both lumDither8 and chrDither8 when should_dither is false */
+{  64, 64, 64, 64, 64, 64, 64, 64 }; /* 64 matches the fixed rounding term of the replaced code: (x + 64) >> 7, and 64 << 12 == 1 << 18 */
  
  static av_always_inline void
  yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
@@ -195,7 +207,7 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
      int i;
      uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
               *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
-    int shift = 15 + 16 - output_bits;
+    int shift = 15 + 16 - output_bits - 1;
  
  #define output_pixel(pos, val) \
      if (big_endian) { \
@@ -212,24 +224,24 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
          } \
      }
      for (i = 0; i < dstW; i++) {
-        int val = 1 << (30-output_bits);
+        int val = 1 << (30-output_bits - 1);
          int j;
  
          for (j = 0; j < lumFilterSize; j++)
-            val += lumSrc[j][i] * lumFilter[j];
+            val += (lumSrc[j][i] * lumFilter[j]) >> 1;
  
          output_pixel(&yDest[i], val);
      }
  
      if (uDest) {
          for (i = 0; i < chrDstW; i++) {
-            int u = 1 << (30-output_bits);
-            int v = 1 << (30-output_bits);
+            int u = 1 << (30-output_bits - 1);
+            int v = 1 << (30-output_bits - 1);
              int j;
  
              for (j = 0; j < chrFilterSize; j++) {
-                u += chrUSrc[j][i] * chrFilter[j];
-                v += chrVSrc[j][i] * chrFilter[j];
+                u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
+                v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
              }
  
              output_pixel(&uDest[i], u);
@@ -239,11 +251,11 @@ yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
  
      if (CONFIG_SWSCALE_ALPHA && aDest) {
          for (i = 0; i < dstW; i++) {
-            int val = 1 << (30-output_bits);
+            int val = 1 << (30-output_bits - 1);
              int j;
  
              for (j = 0; j < lumFilterSize; j++)
-                val += alpSrc[j][i] * lumFilter[j];
+                val += (alpSrc[j][i] * lumFilter[j]) >> 1;
  
              output_pixel(&aDest[i], val);
          }
@@ -285,10 +297,11 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
      uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
      int i;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
  
      //FIXME Optimize (just quickly written not optimized..)
      for (i=0; i<dstW; i++) {
-        int val=1<<18;
+        int val = lumDither[i & 7] << 12;
          int j;
          for (j=0; j<lumFilterSize; j++)
              val += lumSrc[j][i] * lumFilter[j];
@@ -298,8 +311,8 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
  
      if (uDest)
          for (i=0; i<chrDstW; i++) {
-            int u=1<<18;
-            int v=1<<18;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
              int j;
              for (j=0; j<chrFilterSize; j++) {
                  u += chrUSrc[j][i] * chrFilter[j];
@@ -312,7 +325,7 @@ static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
  
      if (CONFIG_SWSCALE_ALPHA && aDest)
          for (i=0; i<dstW; i++) {
-            int val=1<<18;
+            int val = lumDither[i & 7] << 12;
              int j;
              for (j=0; j<lumFilterSize; j++)
                  val += alpSrc[j][i] * lumFilter[j];
@@ -329,23 +342,24 @@ static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
      uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
      int i;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
  
      for (i=0; i<dstW; i++) {
-        int val= (lumSrc[i]+64)>>7;
+        int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
          yDest[i]= av_clip_uint8(val);
      }
  
      if (uDest)
          for (i=0; i<chrDstW; i++) {
-            int u=(chrUSrc[i]+64)>>7;
-            int v=(chrVSrc[i]+64)>>7;
+            int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
+            int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
              uDest[i]= av_clip_uint8(u);
              vDest[i]= av_clip_uint8(v);
          }
  
      if (CONFIG_SWSCALE_ALPHA && aDest)
          for (i=0; i<dstW; i++) {
-            int val= (alpSrc[i]+64)>>7;
+            int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
              aDest[i]= av_clip_uint8(val);
          }
  }
@@ -359,11 +373,12 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
  {
      uint8_t *yDest = dest[0], *uDest = dest[1];
      enum PixelFormat dstFormat = c->dstFormat;
+    const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
  
      //FIXME Optimize (just quickly written not optimized..)
      int i;
      for (i=0; i<dstW; i++) {
-        int val=1<<18;
+        int val = lumDither[i & 7] << 12;
          int j;
          for (j=0; j<lumFilterSize; j++)
              val += lumSrc[j][i] * lumFilter[j];
@@ -376,8 +391,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
  
      if (dstFormat == PIX_FMT_NV12)
          for (i=0; i<chrDstW; i++) {
-            int u=1<<18;
-            int v=1<<18;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
              int j;
              for (j=0; j<chrFilterSize; j++) {
                  u += chrUSrc[j][i] * chrFilter[j];
@@ -389,8 +404,8 @@ static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
          }
      else
          for (i=0; i<chrDstW; i++) {
-            int u=1<<18;
-            int v=1<<18;
+            int u = chrDither[i & 7] << 12;
+            int v = chrDither[(i + 3) & 7] << 12;
              int j;
              for (j=0; j<chrFilterSize; j++) {
                  u += chrUSrc[j][i] * chrFilter[j];
@@ -1136,7 +1151,8 @@ yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
      const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                    *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                    *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
-                  *abuf0 = abuf[0], *abuf1 = abuf[1];
+                  *abuf0 = hasAlpha ? abuf[0] : NULL,
+                  *abuf1 = hasAlpha ? abuf[1] : NULL;
      int  yalpha1 = 4095 - yalpha;
      int uvalpha1 = 4095 - uvalpha;
      int i;
@@ -1854,7 +1870,7 @@ static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_s
      for (i = 0; i < dstW; i++) {
          int j;
          int srcPos = filterPos[i];
-        unsigned int val = 0;
+        int val = 0;
  
          for (j = 0; j < filterSize; j++) {
              val += src[srcPos + j] * filter[filterSize * i + j];
@@ -2351,6 +2367,7 @@ static int swScale(SwsContext *c, const uint8_t* src[],
      yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
      yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
      yuv2packedX_fn yuv2packedX = c->yuv2packedX;
+    int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);
  
      /* vars which will change and which we need to store back in the context */
      int dstY= c->dstY;
@@ -2400,6 +2417,9 @@ static int swScale(SwsContext *c, const uint8_t* src[],
          lastInChrBuf= -1;
      }
  
+    if (!should_dither) {
+        c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
+    }
      lastDstY= dstY;
  
      for (;dstY < dstH; dstY++) {
@@ -2489,6 +2509,10 @@ static int swScale(SwsContext *c, const uint8_t* src[],
  #if HAVE_MMX
          updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
  #endif
+        if (should_dither) {
+            c->chrDither8 = dither_8x8_128[chrDstY & 7];
+            c->lumDither8 = dither_8x8_128[dstY & 7];
+        }
          if (dstY >= dstH-2) {
              // hmm looks like we can't use MMX here without overwriting this array's tail
              find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,