Merge commit '88bd7fdc821aaa0cbcf44cf075c62aaa42121e3f'

[ffmpeg] / libavcodec / x86 / dsputil_mmx.c
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c

index 13f215135a1140e1e0f68f51a2c54842944af46a..2bbe7ff9d38fe8f5fa499d87c5ff221a9ecf121f 100644 (file)
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -3,23 +3,23 @@
   * Copyright (c) 2000, 2001 Fabrice Bellard
   * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   *
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
- *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
   *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
   */
  
  #include "libavutil/cpu.h"
@@ -30,6 +30,7 @@
  #include "libavcodec/simple_idct.h"
  #include "dsputil_mmx.h"
  #include "idct_xvid.h"
+#include "diracdsp_mmx.h"
  
  //#undef NDEBUG
  //#include <assert.h>
@@ -38,9 +39,6 @@
  DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
  DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
  
-DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
-    { 0x8000000080000000ULL, 0x8000000080000000ULL };
-
  DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
  DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
  DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
@@ -228,10 +226,10 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
  /***********************************/
  /* standard MMX */
  
-void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
+void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                 int line_size)
  {
-    const DCTELEM *p;
+    const int16_t *p;
      uint8_t *pix;
  
      /* read the pixels */
@@ -303,7 +301,7 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
      "movq               %%mm3, (%0, %3, 2)  \n\t"           \
      "movq               %%mm4, (%0, %1)     \n\t"
  
-void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
+void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                        int line_size)
  {
      x86_reg line_skip = line_size;
@@ -320,10 +318,10 @@ void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
          : "memory");
  }
  
-void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
+void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                 int line_size)
  {
-    const DCTELEM *p;
+    const int16_t *p;
      uint8_t *pix;
      int i;
  
@@ -425,7 +423,7 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
  }
  
  #define CLEAR_BLOCKS(name, n)                           \
-static void name(DCTELEM *blocks)                       \
+static void name(int16_t *blocks)                       \
  {                                                       \
      __asm__ volatile (                                  \
          "pxor %%mm7, %%mm7              \n\t"           \
@@ -445,7 +443,7 @@ static void name(DCTELEM *blocks)                       \
  CLEAR_BLOCKS(clear_blocks_mmx, 6)
  CLEAR_BLOCKS(clear_block_mmx, 1)
  
-static void clear_block_sse(DCTELEM *block)
+static void clear_block_sse(int16_t *block)
  {
      __asm__ volatile (
          "xorps  %%xmm0, %%xmm0          \n"
@@ -462,7 +460,7 @@ static void clear_block_sse(DCTELEM *block)
      );
  }
  
-static void clear_blocks_sse(DCTELEM *blocks)
+static void clear_blocks_sse(int16_t *blocks)
  {
      __asm__ volatile (
          "xorps  %%xmm0, %%xmm0              \n"
@@ -751,7 +749,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
              : "+r"(ptr)
              : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
              );
-    } else {
+    } else if(w==16){
          __asm__ volatile (
              "1:                                 \n\t"
              "movd            (%0), %%mm0        \n\t"
@@ -772,6 +770,25 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
              : "+r"(ptr)
              : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
              );
+    } else {
+        av_assert1(w == 4);
+        __asm__ volatile (
+            "1:                             \n\t"
+            "movd            (%0), %%mm0    \n\t"
+            "punpcklbw      %%mm0, %%mm0    \n\t"
+            "punpcklwd      %%mm0, %%mm0    \n\t"
+            "movd           %%mm0, -4(%0)   \n\t"
+            "movd      -4(%0, %2), %%mm1    \n\t"
+            "punpcklbw      %%mm1, %%mm1    \n\t"
+            "punpckhwd      %%mm1, %%mm1    \n\t"
+            "punpckhdq      %%mm1, %%mm1    \n\t"
+            "movd           %%mm1, (%0, %2) \n\t"
+            "add               %1, %0       \n\t"
+            "cmp               %3, %0       \n\t"
+            "jb                1b           \n\t"
+            : "+r"(ptr)
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
+            );
      }
  
      /* top and bottom (and hopefully also the corners) */
@@ -1557,63 +1574,6 @@ QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
  QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
  QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
  
-/***********************************/
-/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
-
-#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
-                                                                 uint8_t *src, \
-                                                                 int stride)   \
-{                                                                              \
-    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \
-}
-
-#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
-                                                                 uint8_t *src, \
-                                                                 int stride)   \
-{                                                                              \
-    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \
-                                               S1, S2);                        \
-}
-
-#define QPEL_2TAP(OPNAME, SIZE, MMX)                                        \
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                            \
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                            \
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                               \
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =    \
-    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                \
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =    \
-    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                           \
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =    \
-    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                           \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,      \
-                                                         uint8_t *src,      \
-                                                         int stride)        \
-{                                                                           \
-    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);    \
-}                                                                           \
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,      \
-                                                         uint8_t *src,      \
-                                                         int stride)        \
-{                                                                           \
-    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,              \
-                                            stride, SIZE);                  \
-}                                                                           \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,           1,       0)                \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,          -1,       0)                \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,           stride,  0)                \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,     -stride,  0)                \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,           stride,  1)                \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,           stride, -1)                \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,     -stride,  1)                \
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                \
-
-QPEL_2TAP(put_, 16, mmxext)
-QPEL_2TAP(avg_, 16, mmxext)
-QPEL_2TAP(put_,  8, mmxext)
-QPEL_2TAP(avg_,  8, mmxext)
-
  void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
  {
    put_pixels8_xy2_mmx(dst, src, stride, 8);
@@ -1631,10 +1591,15 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
  }
  
-static void gmc_mmx(uint8_t *dst, uint8_t *src,
-                    int stride, int h, int ox, int oy,
-                    int dxx, int dxy, int dyx, int dyy,
-                    int shift, int r, int width, int height)
+typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
+                                   ptrdiff_t linesize, int block_w, int block_h,
+                                   int src_x, int src_y, int w, int h);
+
+static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
+                                 int stride, int h, int ox, int oy,
+                                 int dxx, int dxy, int dyx, int dyy,
+                                 int shift, int r, int width, int height,
+                                 emulated_edge_mc_func *emu_edge_fn)
  {
      const int w    = 8;
      const int ix   = ox  >> (16 + shift);
@@ -1649,19 +1614,24 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
      const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
      const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
      const uint64_t shift2 = 2 * shift;
+#define MAX_STRIDE 4096U
+#define MAX_H 8U
+    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
      int x, y;
  
      const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
      const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
      const int dxh = dxy * (h - 1);
      const int dyw = dyx * (w - 1);
+    int need_emu =  (unsigned)ix >= width  - w ||
+                    (unsigned)iy >= height - h;
+
      if ( // non-constant fullpel offset (3% of blocks)
          ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
           (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
          // uses more than 16 bits of subpel mv (only at huge resolution)
-        || (dxx | dxy | dyx | dyy) & 15 ||
-        (unsigned)ix >= width  - w ||
-        (unsigned)iy >= height - h) {
+        || (dxx | dxy | dyx | dyy) & 15
+        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
          // FIXME could still use mmx for some of the rows
          ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                   shift, r, width, height);
@@ -1669,6 +1639,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
      }
  
      src += ix + iy * stride;
+    if (need_emu) {
+        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
+        src = edge_buf;
+    }
  
      __asm__ volatile (
          "movd         %0, %%mm6         \n\t"
@@ -1746,6 +1720,39 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src,
          src += 4 - h * stride;
      }
  }
+
+#if CONFIG_VIDEODSP
+#if HAVE_YASM
+#if ARCH_X86_32
+static void gmc_mmx(uint8_t *dst, uint8_t *src,
+                    int stride, int h, int ox, int oy,
+                    int dxx, int dxy, int dyx, int dyy,
+                    int shift, int r, int width, int height)
+{   /* Variant compiled when CONFIG_VIDEODSP, HAVE_YASM and ARCH_X86_32 are
+     * all set (see the enclosing #if chain).  It forwards every argument
+     * unchanged to gmc() and supplies ff_emulated_edge_mc_8 as the
+     * edge-emulation callback that gmc() invokes through emu_edge_fn. */
+    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+        width, height, &ff_emulated_edge_mc_8);
+}
+#endif
+static void gmc_sse(uint8_t *dst, uint8_t *src,
+                    int stride, int h, int ox, int oy,
+                    int dxx, int dxy, int dyx, int dyy,
+                    int shift, int r, int width, int height)
+{   /* Compiled whenever CONFIG_VIDEODSP and HAVE_YASM are set, regardless
+     * of ARCH_X86_32.  Forwards every argument unchanged to gmc() with
+     * ff_emulated_edge_mc_8 as the edge-emulation callback; the call is
+     * textually identical to the one made by gmc_mmx().
+     * NOTE(review): whether ff_emulated_edge_mc_8 resolves to a different
+     * implementation here than in gmc_mmx() is not visible in this file. */
+    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+        width, height, &ff_emulated_edge_mc_8);
+}
+#else
+static void gmc_mmx(uint8_t *dst, uint8_t *src,
+                    int stride, int h, int ox, int oy,
+                    int dxx, int dxy, int dyx, int dyy,
+                    int shift, int r, int width, int height)
+{   /* Definition used on the #else side of HAVE_YASM, i.e. when yasm is
+     * not available (still inside CONFIG_VIDEODSP).  Same body as the yasm
+     * variant: forwards all arguments to gmc() and passes
+     * ff_emulated_edge_mc_8 as the edge-emulation callback. */
+    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
+        width, height, &ff_emulated_edge_mc_8);
+}
+#endif
+#endif
+
  #endif /* HAVE_INLINE_ASM */
  
  #include "h264_qpel.c"
@@ -1829,65 +1836,80 @@ void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
      avg_pixels8_mmxext(dst, src, stride, 8);
  }
  
-static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
+#if CONFIG_DIRAC_DECODER
+#define DIRAC_PIXOP(OPNAME, EXT) /* defines ff_<OPNAME>_dirac_pixels{8,16,32}_<EXT> */\
+void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{ /* only src[0] is read; the other four entries of src are not used here */\
+    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{ /* same forwarding, using the 16-wide helper */\
+    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
+}\
+void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
+{ /* built from two 16-wide helper calls: offset 0 and offset 16 of dst and src[0] */\
+    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
+    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
+}
+
+DIRAC_PIXOP(put, mmx)
+DIRAC_PIXOP(avg, mmx)
+DIRAC_PIXOP(avg, mmxext)
+
+#if HAVE_YASM
+void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
 {
-    int i;
-    __asm__ volatile ("pxor %%mm7, %%mm7":);
-    for (i = 0; i < blocksize; i += 2) {
-        __asm__ volatile (
-            "movq       %0, %%mm0   \n\t"
-            "movq       %1, %%mm1   \n\t"
-            "movq    %%mm0, %%mm2   \n\t"
-            "movq    %%mm1, %%mm3   \n\t"
-            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
-            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
-            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
-            "pxor    %%mm2, %%mm1   \n\t"
-            "movq    %%mm3, %%mm4   \n\t"
-            "pand    %%mm1, %%mm3   \n\t"
-            "pandn   %%mm1, %%mm4   \n\t"
-            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movq    %%mm3, %1      \n\t"
-            "movq    %%mm0, %0      \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
-    __asm__ volatile ("femms");
+    ff_put_pixels16_sse2(dst, src[0], stride, h); /* thin wrapper: forwards src[0] only;
+                                                   * the other four src[] entries are unused */
 }
+void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{   /* Thin wrapper (HAVE_YASM builds only): forwards dst, src[0], stride
+     * and h to ff_avg_pixels16_sse2(); src[1..4] are not read here. */
+    ff_avg_pixels16_sse2(dst, src[0], stride, h);
+}
+void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{   /* Built from two ff_put_pixels16_sse2() calls: one at offset 0 and one
+     * at offset 16 of both dst and src[0].  src[1..4] are not read here. */
+    ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
+    ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
+}
+void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
+{   /* Built from two ff_avg_pixels16_sse2() calls: one at offset 0 and one
+     * at offset 16 of both dst and src[0].  src[1..4] are not read here. */
+    ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
+    ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
+}
+#endif
+#endif
  
-static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+/* XXX: These functions should be removed as soon as all IDCTs are
+ * converted. */
+#if CONFIG_GPL
+static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
+                                    int16_t *block)
 {
-    int i;
+    ff_mmx_idct(block); /* idct_put hook for FF_IDCT_LIBMPEG2MMX when the MMX2
+                         * CPU flag is absent: ff_mmx_idct() runs on block
+                         * first (presumably in place -- confirm), then the
+                         * same block is handed to the clamped "put" routine
+                         * together with dest and line_size. */
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
  
-    __asm__ volatile (
-        "movaps  %0, %%xmm5 \n\t"
-        :: "m"(ff_pdw_80000000[0])
-    );
-    for (i = 0; i < blocksize; i += 4) {
-        __asm__ volatile (
-            "movaps      %0, %%xmm0 \n\t"
-            "movaps      %1, %%xmm1 \n\t"
-            "xorps   %%xmm2, %%xmm2 \n\t"
-            "xorps   %%xmm3, %%xmm3 \n\t"
-            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
-            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
-            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
-            "xorps   %%xmm2, %%xmm1 \n\t"
-            "movaps  %%xmm3, %%xmm4 \n\t"
-            "andps   %%xmm1, %%xmm3 \n\t"
-            "andnps  %%xmm1, %%xmm4 \n\t"
-            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movaps  %%xmm3, %1     \n\t"
-            "movaps  %%xmm0, %0     \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
+static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
+                                    int16_t *block)
+{   /* idct_add hook for FF_IDCT_LIBMPEG2MMX when the MMX2 CPU flag is
+     * absent (CONFIG_GPL builds only): ff_mmx_idct() runs on block first
+     * (presumably in place -- confirm), then the same block is passed to
+     * ff_add_pixels_clamped_mmx() together with dest and line_size. */
+    ff_mmx_idct(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
 }
  
+static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
+                                     int16_t *block)
+{   /* idct_put hook for FF_IDCT_LIBMPEG2MMX when the MMX2 CPU flag is set
+     * (CONFIG_GPL builds only): ff_mmxext_idct() runs on block first
+     * (presumably in place -- confirm), then the same block is passed to
+     * ff_put_pixels_clamped_mmx() together with dest and line_size. */
+    ff_mmxext_idct(block);
+    ff_put_pixels_clamped_mmx(block, dest, line_size);
+}
+
+static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
+                                     int16_t *block)
+{   /* idct_add hook for FF_IDCT_LIBMPEG2MMX when the MMX2 CPU flag is set
+     * (CONFIG_GPL builds only): ff_mmxext_idct() runs on block first
+     * (presumably in place -- confirm), then the same block is passed to
+     * ff_add_pixels_clamped_mmx() together with dest and line_size. */
+    ff_mmxext_idct(block);
+    ff_add_pixels_clamped_mmx(block, dest, line_size);
+}
+#endif
+
  static void vector_clipf_sse(float *dst, const float *src,
                               float min, float max, int len)
  {
@@ -1962,18 +1984,6 @@ int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
  int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                        int w, int left);
  
-float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
-
-void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
-                                const float *src1, int len);
-void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
-                                const float *src1, int len);
-
-void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
-                            const float *src2, int len);
-void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
-                            const float *src2, int len);
-
  void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                     int32_t min, int32_t max, unsigned int len);
  void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
@@ -1983,11 +1993,6 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
  void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                     int32_t min, int32_t max, unsigned int len);
  
-extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
-                                                const float *src1, int len);
-extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
-                                                const float *src1, int len);
-
  #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
      do {                                                                     \
      c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
@@ -2054,24 +2059,11 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
          SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
          SET_HPEL_FUNCS(avg,        1,  8, mmx);
          SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
-
-        switch (avctx->idct_algo) {
-        case FF_IDCT_AUTO:
-        case FF_IDCT_SIMPLEMMX:
-            c->idct_put              = ff_simple_idct_put_mmx;
-            c->idct_add              = ff_simple_idct_add_mmx;
-            c->idct                  = ff_simple_idct_mmx;
-            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
-            break;
-        case FF_IDCT_XVIDMMX:
-            c->idct_put              = ff_idct_xvid_mmx_put;
-            c->idct_add              = ff_idct_xvid_mmx_add;
-            c->idct                  = ff_idct_xvid_mmx;
-            break;
-        }
      }
  
+#if ARCH_X86_32 || !HAVE_YASM
      c->gmc = gmc_mmx;
+#endif
  
      c->add_bytes = add_bytes_mmx;
  
@@ -2101,13 +2093,9 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
  #if HAVE_INLINE_ASM
      SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
      SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );
-    SET_QPEL_FUNCS(avg_2tap_qpel,   0, 16, mmxext, );
-    SET_QPEL_FUNCS(avg_2tap_qpel,   1,  8, mmxext, );
  
      SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
      SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
-    SET_QPEL_FUNCS(put_2tap_qpel,   0, 16, mmxext, );
-    SET_QPEL_FUNCS(put_2tap_qpel,   1,  8, mmxext, );
      SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
      SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
  
@@ -2139,12 +2127,6 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
          }
      }
  
-    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
-        c->idct_put = ff_idct_xvid_mmxext_put;
-        c->idct_add = ff_idct_xvid_mmxext_add;
-        c->idct     = ff_idct_xvid_mmxext;
-    }
-
      if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                                 avctx->codec_id == AV_CODEC_ID_THEORA)) {
          c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
@@ -2238,8 +2220,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
          c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
          c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
      }
-
-    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
  #endif /* HAVE_INLINE_ASM */
  
  #if HAVE_YASM
@@ -2263,17 +2243,13 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
          }
      }
  
-    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-
      c->vector_clipf = vector_clipf_sse;
  #endif /* HAVE_INLINE_ASM */
  
  #if HAVE_YASM
-    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
-    c->vector_fmul_add     = ff_vector_fmul_add_sse;
-
-    c->scalarproduct_float          = ff_scalarproduct_float_sse;
-    c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
+#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
+    c->gmc = gmc_sse;
+#endif
  #endif /* HAVE_YASM */
  }
  
@@ -2424,9 +2400,6 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
              c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
          }
      }
-    c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
-    c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
-    c->vector_fmul_add = ff_vector_fmul_add_avx;
  #endif /* HAVE_AVX_EXTERNAL */
  }
  
@@ -2439,8 +2412,50 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
          c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
  #endif
  
-    if (mm_flags & AV_CPU_FLAG_MMX)
+    if (mm_flags & AV_CPU_FLAG_MMX) {
+#if HAVE_INLINE_ASM
+        const int idct_algo = avctx->idct_algo;
+
+        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
+            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
+                c->idct_put              = ff_simple_idct_put_mmx;
+                c->idct_add              = ff_simple_idct_add_mmx;
+                c->idct                  = ff_simple_idct_mmx;
+                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
+#if CONFIG_GPL
+            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
+                if (mm_flags & AV_CPU_FLAG_MMX2) {
+                    c->idct_put = ff_libmpeg2mmx2_idct_put;
+                    c->idct_add = ff_libmpeg2mmx2_idct_add;
+                    c->idct     = ff_mmxext_idct;
+                } else {
+                    c->idct_put = ff_libmpeg2mmx_idct_put;
+                    c->idct_add = ff_libmpeg2mmx_idct_add;
+                    c->idct     = ff_mmx_idct;
+                }
+                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
+#endif
+            } else if (idct_algo == FF_IDCT_XVIDMMX) {
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+                    c->idct_put              = ff_idct_xvid_sse2_put;
+                    c->idct_add              = ff_idct_xvid_sse2_add;
+                    c->idct                  = ff_idct_xvid_sse2;
+                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
+                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+                    c->idct_put              = ff_idct_xvid_mmxext_put;
+                    c->idct_add              = ff_idct_xvid_mmxext_add;
+                    c->idct                  = ff_idct_xvid_mmxext;
+                } else {
+                    c->idct_put              = ff_idct_xvid_mmx_put;
+                    c->idct_add              = ff_idct_xvid_mmx_add;
+                    c->idct                  = ff_idct_xvid_mmx;
+                }
+            }
+        }
+#endif /* HAVE_INLINE_ASM */
+
          dsputil_init_mmx(c, avctx, mm_flags);
+    }
  
      if (mm_flags & AV_CPU_FLAG_MMXEXT)
          dsputil_init_mmxext(c, avctx, mm_flags);