]> git.sesse.net Git - ffmpeg/blobdiff - libavcodec/x86/mpegaudiodec_mmx.c
Merge remote-tracking branch 'qatar/master'
[ffmpeg] / libavcodec / x86 / mpegaudiodec_mmx.c
index 980faf9cde14b831d08281d3d1baba07cfe239cb..939b441277675c0c8fc97000ad606ac12b4b90dc 100644 (file)
@@ -29,6 +29,12 @@ void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
 void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
 void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
 void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
+void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
+                               float *tmpbuf); /* four-block variant, called from imdct36_blocks_* in this file */
+void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
+                               float *tmpbuf); /* callers in this file pass a local float[1024] (LOCAL_ALIGNED_16) as tmpbuf */
+
+DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; /* [table][window j][4*i + lane]; filled in ff_mpadsp_init_mmx() */
 
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -100,7 +106,26 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     float sum;
 
     /* copy to avoid wrap */
-    memcpy(in + 512, in, 32 * sizeof(*in));
+    __asm__ volatile(
+            "movaps    0(%0), %%xmm0   \n\t" \
+            "movaps   16(%0), %%xmm1   \n\t" \
+            "movaps   32(%0), %%xmm2   \n\t" \
+            "movaps   48(%0), %%xmm3   \n\t" \
+            "movaps   %%xmm0,   0(%1) \n\t" \
+            "movaps   %%xmm1,  16(%1) \n\t" \
+            "movaps   %%xmm2,  32(%1) \n\t" \
+            "movaps   %%xmm3,  48(%1) \n\t" \
+            "movaps   64(%0), %%xmm0   \n\t" \
+            "movaps   80(%0), %%xmm1   \n\t" \
+            "movaps   96(%0), %%xmm2   \n\t" \
+            "movaps  112(%0), %%xmm3   \n\t" \
+            "movaps   %%xmm0,  64(%1) \n\t" \
+            "movaps   %%xmm1,  80(%1) \n\t" \
+            "movaps   %%xmm2,  96(%1) \n\t" \
+            "movaps   %%xmm3, 112(%1) \n\t"
+            ::"r"(in), "r"(in+512)
+            :"memory"
+            );
 
     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
@@ -153,26 +178,88 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
     *out = sum;
 }
 
+
+#define DECL_IMDCT_BLOCKS(CPU1, CPU2) /* emits imdct36_blocks_CPU1(); CPU2 names the four-block kernel */ \
+static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
+                               int count, int switch_point, int block_type) \
+{                                                                           \
+    int align_end = count - (count & 3); /* count rounded down to a multiple of 4 */ \
+    int j;                                                              \
+    for (j = 0; j < align_end; j+= 4) {                                 \
+        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
+        float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
+        /* window chosen above: table 1 only for the first group of four */ \
+        /* (j == 0) when switch_point is nonzero, table 0 otherwise      */ \
+        /* one call handles four blocks (window + overlap with buf)      */ \
+        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
+        in      += 4*18;                                                \
+        buf     += 4*18;                                                \
+        out     += 4;                                                   \
+    }                                                                   \
+    for (; j < count; j++) {                                            \
+        /* at most three blocks remain; they are handled one call each  */ \
+        /* select window: index 0 for the first two blocks when         */ \
+        /* switch_point is nonzero, else block_type; odd j adds 4       */ \
+        int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
+        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
+                                                                        \
+        ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
+        /* NOTE(review): buf advances by 1 here but by 4*18 in the loop above -- confirm buf layout */ \
+        in  += 18;                                                      \
+        buf++;                                                          \
+        out++;                                                          \
+    }                                                                   \
+}
+
+#if HAVE_YASM
+#if HAVE_SSE
+DECL_IMDCT_BLOCKS(sse,sse) /* every variant in this group uses the sse four-block kernel */
+DECL_IMDCT_BLOCKS(sse2,sse)
+DECL_IMDCT_BLOCKS(sse3,sse)
+DECL_IMDCT_BLOCKS(ssse3,sse)
+#endif /* HAVE_SSE */
+#if HAVE_AVX
+DECL_IMDCT_BLOCKS(avx,avx)
+#endif /* HAVE_AVX */
+#endif /* HAVE_YASM */
+
 void ff_mpadsp_init_mmx(MPADSPContext *s)
 {
     int mm_flags = av_get_cpu_flags();
 
+    int i, j; /* loop indices for filling mdct_win_sse */
+    for (j = 0; j < 4; j++) { /* j also addresses window j + 4 */
+        for (i = 0; i < 40; i ++) { /* four consecutive floats are written per i */
+            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i]; /* table 0: windows j, j+4, j, j+4 */
+            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
+            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
+            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
+            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i]; /* table 1: windows 0, 4, j, j+4 */
+            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
+            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
+            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
+        }
+    }
+    /* install implementations according to the runtime CPU flags */
     if (mm_flags & AV_CPU_FLAG_SSE2) {
         s->apply_window_float = apply_window_mp3;
     }
-    if (HAVE_YASM && mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
-        s->imdct36_float = ff_imdct36_float_avx;
-    }
-    else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSE) {
-        s->imdct36_float = ff_imdct36_float_ssse3;
-    }
-    else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE3 && HAVE_SSE) {
-        s->imdct36_float = ff_imdct36_float_sse3;
-    }
-    else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
-        s->imdct36_float = ff_imdct36_float_sse2;
-    }
-    else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
-        s->imdct36_float = ff_imdct36_float_sse;
+#if HAVE_YASM
+    if (0) { /* never taken; lets every #if'd branch below start with "} else if" */
+#if HAVE_AVX
+    } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { /* HAVE_AVX is already nonzero inside this #if */
+        s->imdct36_blocks_float = imdct36_blocks_avx;
+#endif
+#if HAVE_SSE
+    } else if (mm_flags & AV_CPU_FLAG_SSSE3) { /* first matching flag in the chain wins */
+        s->imdct36_blocks_float = imdct36_blocks_ssse3;
+    } else if (mm_flags & AV_CPU_FLAG_SSE3) {
+        s->imdct36_blocks_float = imdct36_blocks_sse3;
+    } else if (mm_flags & AV_CPU_FLAG_SSE2) {
+        s->imdct36_blocks_float = imdct36_blocks_sse2;
+    } else if (mm_flags & AV_CPU_FLAG_SSE) {
+        s->imdct36_blocks_float = imdct36_blocks_sse;
+#endif /* HAVE_SSE */
     }
+#endif /* HAVE_YASM */
 }