avcodec: check if we can feed variable size frames to audio encoder
diff --git a/modules/codec/avcodec/copy.c b/modules/codec/avcodec/copy.c
index 1573b93a2cafe278e21d4bce0fb6eb1f4c6f5ff9..e0823aaf69edecd3a7147dcc0e059b0b0602e6a1 100644
--- a/modules/codec/avcodec/copy.c
+++ b/modules/codec/avcodec/copy.c
@@ -6,19 +6,19 @@
  *
  * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  *****************************************************************************/
 
 #ifdef HAVE_CONFIG_H
 
 #include "copy.h"
 
-/* Copy 64 bytes from srcp to dsp loading data with the SSE>=2 instruction load and
- * storing data with the SSE>=2 instruction store.
+int CopyInitCache(copy_cache_t *cache, unsigned width)
+{
+#ifdef CAN_COMPILE_SSE2
+    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
+    cache->buffer = vlc_memalign(16, cache->size);
+    if (!cache->buffer)
+        return VLC_EGENERIC;
+#else
+    (void) cache; (void) width;
+#endif
+    return VLC_SUCCESS;
+}
+
+void CopyCleanCache(copy_cache_t *cache)
+{
+#ifdef CAN_COMPILE_SSE2
+    vlc_free(cache->buffer);
+    cache->buffer = NULL;
+    cache->size   = 0;
+#else
+    (void) cache;
+#endif
+}
+
+#ifdef CAN_COMPILE_SSE2
+/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
+ * load and storing data with the SSE>=2 instruction store.
  */
 #define COPY64(dstp, srcp, load, store) \
     asm volatile (                      \
         store " %%xmm2,   16(%[dst])\n" \
         store " %%xmm3,   32(%[dst])\n" \
         store " %%xmm4,   48(%[dst])\n" \
-        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
 
-/* Execute the instruction op only if SSE2 is supported. */
-#ifdef CAN_COMPILE_SSE2
-# ifdef __SSE2__
-#  define ASM_SSE2(cpu, op) asm volatile (op)
-# else
-#  define ASM_SSE2(cpu, op) do { \
-    if (cpu & VLC_CPU_SSE2) \
-        asm volatile (op); \
-    } while (0)
-#  undef vlc_CPU_SSE2
-#  define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
-# endif
-#else
-# define ASM_SSE2(cpu, op)
+#ifndef __SSE4_1__
+# undef vlc_CPU_SSE4_1
+# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
+#endif
+
+#ifndef __SSSE3__
+# undef vlc_CPU_SSSE3
+# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
+#endif
+
+#ifndef __SSE2__
+# undef vlc_CPU_SSE2
+# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
 #endif
 
 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
  * as used by some video surfaces.
  * XXX It is really efficient only when SSE4.1 is available.
  */
+VLC_SSE
 static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          unsigned width, unsigned height,
@@ -74,16 +99,17 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
 {
     assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
 
-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");
+
     for (unsigned y = 0; y < height; y++) {
-        const unsigned unaligned = (intptr_t)src & 0x0f;
-        unsigned x;
+        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
+        unsigned x = 0;
 
-        for (x = 0; x < unaligned; x++)
+        for (; x < unaligned; x++)
             dst[x] = src[x];
 
 #ifdef CAN_COMPILE_SSE4_1
-        if (cpu & CPU_CAPABILITY_SSE4_1) {
+        if (vlc_CPU_SSE4_1()) {
             if (!unaligned) {
                 for (; x+63 < width; x += 64)
                     COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
@@ -93,8 +119,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
             }
         } else
 #endif
-#ifdef CAN_COMPILE_SSE2
-        if (vlc_CPU_SSE2()) {
+        {
             if (!unaligned) {
                 for (; x+63 < width; x += 64)
                     COPY64(&dst[x], &src[x], "movdqa", "movdqa");
@@ -103,7 +128,6 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                     COPY64(&dst[x], &src[x], "movdqa", "movdqu");
             }
         }
-#endif
 
         for (; x < width; x++)
             dst[x] = src[x];
@@ -113,30 +137,26 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
     }
 }
 
+VLC_SSE
 static void Copy2d(uint8_t *dst, size_t dst_pitch,
                    const uint8_t *src, size_t src_pitch,
-                   unsigned width, unsigned height,
-                   unsigned cpu)
+                   unsigned width, unsigned height)
 {
     assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
 
-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");
 
     for (unsigned y = 0; y < height; y++) {
         unsigned x = 0;
-        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
 
-#ifdef CAN_COMPILE_SSE2
-        if (vlc_CPU_SSE2()) {
-            if (!unaligned) {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movntdq");
-            } else {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
-            }
+        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
+        if (!unaligned) {
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
+        } else {
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
         }
-#endif
 
         for (; x < width; x++)
             dst[x] = src[x];
@@ -146,10 +166,11 @@ static void Copy2d(uint8_t *dst, size_t dst_pitch,
     }
 }
 
-static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
-                    uint8_t *dstv, size_t dstv_pitch,
-                    const uint8_t *src, size_t src_pitch,
-                    unsigned width, unsigned height, unsigned cpu)
+VLC_SSE
+static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
+                        uint8_t *dstv, size_t dstv_pitch,
+                        const uint8_t *src, size_t src_pitch,
+                        unsigned width, unsigned height, unsigned cpu)
 {
     const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                 1, 3, 5, 7, 9, 11, 13, 15 };
@@ -158,7 +179,7 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
 
     assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
 
-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");
 
     for (unsigned y = 0; y < height; y++) {
         unsigned x = 0;
@@ -180,7 +201,8 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
     "movhpd %%xmm3,  24(%[dst2])\n"
 
 #ifdef CAN_COMPILE_SSSE3
-        if (cpu & CPU_CAPABILITY_SSSE3) {
+        if (vlc_CPU_SSSE3())
+        {
             for (x = 0; x < (width & ~31); x += 32) {
                 asm volatile (
                     "movdqu (%[shuffle]), %%xmm7\n"
@@ -190,12 +212,11 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                     "pshufb  %%xmm7, %%xmm2\n"
                     "pshufb  %%xmm7, %%xmm3\n"
                     STORE2X32
-                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory");
+                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
             }
         } else
 #endif
-#ifdef CAN_COMPILE_SSE2
-        if (vlc_CPU_SSE2()) {
+        {
             for (x = 0; x < (width & ~31); x += 32) {
                 asm volatile (
                     "movdqu (%[mask]), %%xmm7\n"
@@ -216,10 +237,9 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                     "packuswb %%xmm6, %%xmm2\n"
                     "packuswb %%xmm7, %%xmm3\n"
                     STORE2X32
-                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory");
+                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
             }
         }
-#endif
 #undef STORE2X32
 #undef LOAD64
 
@@ -233,10 +253,10 @@ static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
     }
 }
 
-static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
-                      uint8_t *cache, size_t cache_size,
-                      unsigned width, unsigned height,
-                      unsigned cpu)
+static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
+                          const uint8_t *src, size_t src_pitch,
+                          uint8_t *cache, size_t cache_size,
+                          unsigned width, unsigned height, unsigned cpu)
 {
     const unsigned w16 = (width+15) & ~15;
     const unsigned hstep = cache_size / w16;
@@ -253,21 +273,20 @@ static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t
         /* Copy from our cache to the destination */
         Copy2d(dst, dst_pitch,
                cache, w16,
-               width, hblock, cpu);
+               width, hblock);
 
         /* */
         src += src_pitch * hblock;
         dst += dst_pitch * hblock;
     }
-
-    ASM_SSE2(cpu, "mfence");
+    asm volatile ("mfence");
 }
-static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
-                        uint8_t *dstv, size_t dstv_pitch,
-                        const uint8_t *src, size_t src_pitch,
-                        uint8_t *cache, size_t cache_size,
-                        unsigned width, unsigned height,
-                        unsigned cpu)
+
+static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+                            uint8_t *dstv, size_t dstv_pitch,
+                            const uint8_t *src, size_t src_pitch,
+                            uint8_t *cache, size_t cache_size,
+                            unsigned width, unsigned height, unsigned cpu)
 {
     const unsigned w2_16 = (2*width+15) & ~15;
     const unsigned hstep = cache_size / w2_16;
@@ -277,76 +296,121 @@ static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
         const unsigned hblock =  __MIN(hstep, height - y);
 
         /* Copy a bunch of line into our cache */
-        CopyFromUswc(cache, w2_16,
-                     src, src_pitch,
+        CopyFromUswc(cache, w2_16, src, src_pitch,
                      2*width, hblock, cpu);
 
         /* Copy from our cache to the destination */
-        SplitUV(dstu, dstu_pitch,
-                dstv, dstv_pitch,
-                cache, w2_16,
-                width, hblock, cpu);
+        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
+                    cache, w2_16, width, hblock, cpu);
 
         /* */
         src  += src_pitch  * hblock;
         dstu += dstu_pitch * hblock;
         dstv += dstv_pitch * hblock;
     }
+    asm volatile ("mfence");
+}
 
-    ASM_SSE2(cpu, "mfence");
+static void SSE_CopyFromNv12(picture_t *dst,
+                             uint8_t *src[2], size_t src_pitch[2],
+                             unsigned width, unsigned height,
+                             copy_cache_t *cache, unsigned cpu)
+{
+    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+                  src[0], src_pitch[0],
+                  cache->buffer, cache->size,
+                  width, height, cpu);
+    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
+                    dst->p[1].p_pixels, dst->p[1].i_pitch,
+                    src[1], src_pitch[1],
+                    cache->buffer, cache->size,
+                    width/2, height/2, cpu);
+    asm volatile ("emms");
 }
 
-int CopyInitCache(copy_cache_t *cache, unsigned width)
+static void SSE_CopyFromYv12(picture_t *dst,
+                             uint8_t *src[3], size_t src_pitch[3],
+                             unsigned width, unsigned height,
+                             copy_cache_t *cache, unsigned cpu)
 {
-    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
-    cache->buffer = vlc_memalign(16, cache->size);
-    if (!cache->buffer)
-        return VLC_EGENERIC;
-    return VLC_SUCCESS;
+    for (unsigned n = 0; n < 3; n++) {
+        const unsigned d = n > 0 ? 2 : 1;
+        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
+                      src[n], src_pitch[n],
+                      cache->buffer, cache->size,
+                      width/d, height/d, cpu);
+    }
+    asm volatile ("emms");
 }
-void CopyCleanCache(copy_cache_t *cache)
+#undef COPY64
+#endif /* CAN_COMPILE_SSE2 */
+
+static void CopyPlane(uint8_t *dst, size_t dst_pitch,
+                      const uint8_t *src, size_t src_pitch,
+                      unsigned width, unsigned height)
 {
-    vlc_free(cache->buffer);
-    cache->buffer = NULL;
-    cache->size   = 0;
+    for (unsigned y = 0; y < height; y++) {
+        memcpy(dst, src, width);
+        src += src_pitch;
+        dst += dst_pitch;
+    }
+}
+
+static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+                        uint8_t *dstv, size_t dstv_pitch,
+                        const uint8_t *src, size_t src_pitch,
+                        unsigned width, unsigned height)
+{
+    for (unsigned y = 0; y < height; y++) {
+        for (unsigned x = 0; x < width; x++) {
+            dstu[x] = src[2*x+0];
+            dstv[x] = src[2*x+1];
+        }
+        src  += src_pitch;
+        dstu += dstu_pitch;
+        dstv += dstv_pitch;
+    }
 }
 
 void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                   unsigned width, unsigned height,
                   copy_cache_t *cache)
 {
-    const unsigned cpu = vlc_CPU();
+#ifdef CAN_COMPILE_SSE2
+    unsigned cpu = vlc_CPU();
+    if (vlc_CPU_SSE2())
+        return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
+                                cache, cpu);
+#else
+    (void) cache;
+#endif
 
-    /* */
     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
               src[0], src_pitch[0],
-              cache->buffer, cache->size,
-              width, height, cpu);
+              width, height);
     SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                 dst->p[1].p_pixels, dst->p[1].i_pitch,
                 src[1], src_pitch[1],
-                cache->buffer, cache->size,
-                width/2, height/2, cpu);
-
-    ASM_SSE2(cpu, "emms");
+                width/2, height/2);
 }
+
 void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                   unsigned width, unsigned height,
                   copy_cache_t *cache)
 {
-    const unsigned cpu = vlc_CPU();
+#ifdef CAN_COMPILE_SSE2
+    unsigned cpu = vlc_CPU();
+    if (vlc_CPU_SSE2())
+        return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
+                                cache, cpu);
+#else
+    (void) cache;
+#endif
 
-    /* */
-    for (unsigned n = 0; n < 3; n++) {
-        const unsigned d = n > 0 ? 2 : 1;
-        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
-                  src[n], src_pitch[n],
-                  cache->buffer, cache->size,
-                  width/d, height/d, cpu);
-    }
-    ASM_SSE2(cpu, "emms");
+     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+               src[0], src_pitch[0], width, height);
+     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
+               src[1], src_pitch[1], width / 2, height / 2);
+     CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
+               src[2], src_pitch[2], width / 2, height / 2);
 }
-
-#undef ASM_SSE2
-#undef COPY64
-
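For reference, a minimal caller-side sketch of the cache API this patch reorganizes. The caller, its name, and the per-call cache lifetime are hypothetical (the real users keep the copy_cache_t in their decoder context and initialize it once, not per frame); only CopyInitCache, CopyFromNv12, CopyCleanCache and their signatures come from the code above.

/* Hypothetical caller: copy one hardware NV12 surface into a picture_t
 * using the cache-backed copy helpers. */
#include "copy.h"

static int ExtractNv12Surface(picture_t *pic,
                              uint8_t *plane[2], size_t pitch[2],
                              unsigned width, unsigned height)
{
    copy_cache_t cache;

    /* One 16-byte aligned bounce buffer sized from the surface width
     * (only actually allocated when the SSE2 path is compiled in). */
    if (CopyInitCache(&cache, width) != VLC_SUCCESS)
        return VLC_EGENERIC;

    /* Copies the luma plane and deinterleaves the NV12 chroma plane into
     * the picture's U/V planes, taking the SSE path when the CPU has SSE2. */
    CopyFromNv12(pic, plane, pitch, width, height, &cache);

    CopyCleanCache(&cache);
    return VLC_SUCCESS;
}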