git.sesse.net Git - ffmpeg/blobdiff - libswscale/utils.c
Merge commit '81b9bf319226fe03436c80aaa8a2c91767cab7ce'
[ffmpeg] / libswscale / utils.c
index 7274153453b74df393ba378d824219f4e823dfe6..851965091a8d973a454aec3dced8656fda77eed4 100644 (file)
@@ -72,8 +72,6 @@ const char *swscale_license(void)
     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 }
 
-#define RET 0xC3 // near return opcode for x86
-
 typedef struct FormatEntry {
     uint8_t is_supported_in         :1;
     uint8_t is_supported_out        :1;
@@ -681,172 +679,6 @@ fail:
     return ret;
 }
 
-#if HAVE_MMXEXT_INLINE
-static av_cold int init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
-                                       int16_t *filter, int32_t *filterPos,
-                                       int numSplits)
-{
-    uint8_t *fragmentA;
-    x86_reg imm8OfPShufW1A;
-    x86_reg imm8OfPShufW2A;
-    x86_reg fragmentLengthA;
-    uint8_t *fragmentB;
-    x86_reg imm8OfPShufW1B;
-    x86_reg imm8OfPShufW2B;
-    x86_reg fragmentLengthB;
-    int fragmentPos;
-
-    int xpos, i;
-
-    // create an optimized horizontal scaling routine
-    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
-     * pshufw instructions. For every four output pixels, if four input pixels
-     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
-     * used. If five input pixels are needed, then a chunk of fragmentA is used.
-     */
-
-    // code fragment
-
-    __asm__ volatile (
-        "jmp                         9f                 \n\t"
-        // Begin
-        "0:                                             \n\t"
-        "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t"
-        "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t"
-        "movd   1(%%"REG_c", %%"REG_S"), %%mm1          \n\t"
-        "punpcklbw                %%mm7, %%mm1          \n\t"
-        "punpcklbw                %%mm7, %%mm0          \n\t"
-        "pshufw                   $0xFF, %%mm1, %%mm1   \n\t"
-        "1:                                             \n\t"
-        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
-        "2:                                             \n\t"
-        "psubw                    %%mm1, %%mm0          \n\t"
-        "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t"
-        "pmullw                   %%mm3, %%mm0          \n\t"
-        "psllw                       $7, %%mm1          \n\t"
-        "paddw                    %%mm1, %%mm0          \n\t"
-
-        "movq                     %%mm0, (%%"REG_D", %%"REG_a") \n\t"
-
-        "add                         $8, %%"REG_a"      \n\t"
-        // End
-        "9:                                             \n\t"
-        // "int $3                                         \n\t"
-        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
-        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
-        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
-        "dec                         %1                 \n\t"
-        "dec                         %2                 \n\t"
-        "sub                         %0, %1             \n\t"
-        "sub                         %0, %2             \n\t"
-        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
-        "sub                         %0, %3             \n\t"
-
-
-        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
-          "=r" (fragmentLengthA)
-        );
-
-    __asm__ volatile (
-        "jmp                         9f                 \n\t"
-        // Begin
-        "0:                                             \n\t"
-        "movq    (%%"REG_d", %%"REG_a"), %%mm3          \n\t"
-        "movd    (%%"REG_c", %%"REG_S"), %%mm0          \n\t"
-        "punpcklbw                %%mm7, %%mm0          \n\t"
-        "pshufw                   $0xFF, %%mm0, %%mm1   \n\t"
-        "1:                                             \n\t"
-        "pshufw                   $0xFF, %%mm0, %%mm0   \n\t"
-        "2:                                             \n\t"
-        "psubw                    %%mm1, %%mm0          \n\t"
-        "movl   8(%%"REG_b", %%"REG_a"), %%esi          \n\t"
-        "pmullw                   %%mm3, %%mm0          \n\t"
-        "psllw                       $7, %%mm1          \n\t"
-        "paddw                    %%mm1, %%mm0          \n\t"
-
-        "movq                     %%mm0, (%%"REG_D", %%"REG_a") \n\t"
-
-        "add                         $8, %%"REG_a"      \n\t"
-        // End
-        "9:                                             \n\t"
-        // "int                       $3                   \n\t"
-        "lea       " LOCAL_MANGLE(0b) ", %0             \n\t"
-        "lea       " LOCAL_MANGLE(1b) ", %1             \n\t"
-        "lea       " LOCAL_MANGLE(2b) ", %2             \n\t"
-        "dec                         %1                 \n\t"
-        "dec                         %2                 \n\t"
-        "sub                         %0, %1             \n\t"
-        "sub                         %0, %2             \n\t"
-        "lea       " LOCAL_MANGLE(9b) ", %3             \n\t"
-        "sub                         %0, %3             \n\t"
-
-
-        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
-          "=r" (fragmentLengthB)
-        );
-
-    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
-    fragmentPos = 0;
-
-    for (i = 0; i < dstW / numSplits; i++) {
-        int xx = xpos >> 16;
-
-        if ((i & 3) == 0) {
-            int a                  = 0;
-            int b                  = ((xpos + xInc) >> 16) - xx;
-            int c                  = ((xpos + xInc * 2) >> 16) - xx;
-            int d                  = ((xpos + xInc * 3) >> 16) - xx;
-            int inc                = (d + 1 < 4);
-            uint8_t *fragment      = inc ? fragmentB : fragmentA;
-            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
-            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
-            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
-            int maxShift           = 3 - (d + inc);
-            int shift              = 0;
-
-            if (filterCode) {
-                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
-                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
-                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
-                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
-                filterPos[i / 2] = xx;
-
-                memcpy(filterCode + fragmentPos, fragment, fragmentLength);
-
-                filterCode[fragmentPos + imm8OfPShufW1] =  (a + inc)       |
-                                                          ((b + inc) << 2) |
-                                                          ((c + inc) << 4) |
-                                                          ((d + inc) << 6);
-                filterCode[fragmentPos + imm8OfPShufW2] =  a | (b << 2) |
-                                                               (c << 4) |
-                                                               (d << 6);
-
-                if (i + 4 - inc >= dstW)
-                    shift = maxShift;               // avoid overread
-                else if ((filterPos[i / 2] & 3) <= maxShift)
-                    shift = filterPos[i / 2] & 3;   // align
-
-                if (shift && i >= shift) {
-                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
-                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
-                    filterPos[i / 2]                        -= shift;
-                }
-            }
-
-            fragmentPos += fragmentLength;
-
-            if (filterCode)
-                filterCode[fragmentPos] = RET;
-        }
-        xpos += xInc;
-    }
-    if (filterCode)
-        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16;  // needed to jump to the next part
-
-    return fragmentPos + 1;
-}
-#endif /* HAVE_MMXEXT_INLINE */
-
 static void fill_rgb2yuv_table(SwsContext *c, const int table[4], int dstRange)
 {
     int64_t W, V, Z, Cy, Cu, Cv;
@@ -1006,7 +838,7 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
 
     //The srcBpc check is possibly wrong but we seem to lack a definitive reference to test this
     //and what we have in ticket 2939 looks better with this check
-    if (need_reinit && c->srcBpc == 8)
+    if (need_reinit && (c->srcBpc == 8 || !isYUV(c->srcFormat)))
         ff_sws_init_range_convert(c);
 
     if ((isYUV(c->dstFormat) || isGray(c->dstFormat)) && (isYUV(c->srcFormat) || isGray(c->srcFormat)))
@@ -1400,9 +1232,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
 #if HAVE_MMXEXT_INLINE
 // can't downscale !!!
         if (c->canMMXEXTBeUsed && (flags & SWS_FAST_BILINEAR)) {
-            c->lumMmxextFilterCodeSize = init_hscaler_mmxext(dstW, c->lumXInc, NULL,
+            c->lumMmxextFilterCodeSize = ff_init_hscaler_mmxext(dstW, c->lumXInc, NULL,
                                                              NULL, NULL, 8);
-            c->chrMmxextFilterCodeSize = init_hscaler_mmxext(c->chrDstW, c->chrXInc,
+            c->chrMmxextFilterCodeSize = ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc,
                                                              NULL, NULL, NULL, 4);
 
 #if USE_MMAP
@@ -1443,9 +1275,9 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
             FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW       / 2 / 8 + 8) * sizeof(int32_t), fail);
             FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail);
 
-            init_hscaler_mmxext(      dstW, c->lumXInc, c->lumMmxextFilterCode,
+            ff_init_hscaler_mmxext(      dstW, c->lumXInc, c->lumMmxextFilterCode,
                                 c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8);
-            init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode,
+            ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode,
                                 c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4);
 
 #if USE_MMAP