MMX code for predictor rounding/clipping

author Fiona Glaser <fiona@x264.com>

Fri, 16 Apr 2010 19:06:07 +0000 (12:06 -0700)

committer Fiona Glaser <fiona@x264.com>

Fri, 23 Apr 2010 19:40:11 +0000 (12:40 -0700)
author Fiona Glaser <fiona@x264.com>
Fri, 16 Apr 2010 19:06:07 +0000 (12:06 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 23 Apr 2010 19:40:11 +0000 (12:40 -0700)
diff --git a/common/common.h b/common/common.h

index c63fbd9ca1bb44a0ad57beb9a0a2404f56bd06f1..3973558a207539088fbc961087c6d7dbe30327c8 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -188,6 +188,17 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd
      return amvd0 + (amvd1<<8);
  }
  
+static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+{   /* In place: replace each of the i_mvc {x,y} pairs in mvc[] by (v+2)>>2, then pass x through x264_clip3 with the x bounds and y with the y bounds. */
+    for( int i = 0; i < i_mvc; i++ ) /* a count <= 0 leaves mvc[] untouched */
+    {
+        int mx = (mvc[i][0] + 2) >> 2; /* +2 before the >>2 rounds to nearest instead of always rounding down */
+        int my = (mvc[i][1] + 2) >> 2;
+        mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
+        mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max ); /* y goes to slot [1]; slot [0] already holds the clipped x */
+    }
+}
+
  extern const uint8_t x264_exp2_lut[64];
  extern const float x264_log2_lut[128];
  extern const float x264_log2_lz_lut[32];
diff --git a/common/x86/util.h b/common/x86/util.h

index 89bc12780a66f454b62ed8b3a419ad6fa41ebc4a..2250a3dc7c8c989159be33cf82e8882388432a6e 100644 (file)
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -45,6 +45,7 @@ static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16
          :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
      );
  }
+
  #define x264_predictor_difference x264_predictor_difference_mmxext
  static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
  {
@@ -80,6 +81,7 @@ static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], in
      );
      return sum;
  }
+
  #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
  static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
  {
@@ -103,6 +105,45 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
      );
      return amvd;
  }
+
+#define x264_predictor_roundclip x264_predictor_roundclip_mmxext /* route x264_predictor_roundclip() calls to the MMXEXT version below */
+static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+{   /* In place: every 16-bit component of the i_mvc {x,y} pairs in mvc[] becomes (v+2)>>2, then is limited to the packed min/max bounds. i_mvc must be nonzero (see the jz note below). */
+    uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); /* presumably x in the low 16 bits, y in the high 16, matching the in-memory {x,y} pair - confirm against pack16to32_mask */
+    uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max );
+    static const uint64_t pw_2 = 0x0002000200020002ULL; /* the value 2 in each of the four 16-bit lanes: the rounding bias */
+    intptr_t i = i_mvc; /* pointer-sized copy of the count; used as the index register in the addressing modes below and counted down to 0 */
+    asm(
+        "movd    %2, %%mm5       \n" /* mm5 low dword = mv_min */
+        "movd    %3, %%mm6       \n" /* mm6 low dword = mv_max */
+        "movq    %4, %%mm7       \n" /* mm7 = pw_2 */
+        "punpckldq %%mm5, %%mm5  \n" /* copy the low dword into the high dword: mm5 = {mv_min, mv_min}, so two pairs are bounded at once */
+        "punpckldq %%mm6, %%mm6  \n" /* likewise mm6 = {mv_max, mv_max} */
+        "test $1, %0             \n" /* is the remaining count odd? */
+        "jz 1f                   \n" /* even: go straight to the two-at-a-time loop. NOTE: a count of 0 also lands here and would run the loop body, hence the nonzero requirement */
+        "movd -4(%5,%0,4), %%mm0 \n" /* odd: load the single last pair, mvc[i-1] (4 bytes per pair) */
+        "paddw %%mm7, %%mm0      \n" /* + 2 per 16-bit lane */
+        "psraw $2, %%mm0         \n" /* arithmetic >> 2 per lane */
+        "pmaxsw %%mm5, %%mm0     \n" /* signed max with the lower bounds */
+        "pminsw %%mm6, %%mm0     \n" /* signed min with the upper bounds */
+        "movd %%mm0, -4(%5,%0,4) \n" /* store the pair back in place */
+        "dec %0                  \n" /* one pair consumed */
+        "jz 2f                   \n" /* that was the only pair: done */
+        "1:                      \n" /* main loop: two pairs (8 bytes) per iteration, walking from the end of the array toward the start */
+        "movq -8(%5,%0,4), %%mm0 \n" /* load mvc[i-2] and mvc[i-1] */
+        "paddw %%mm7, %%mm0      \n" /* same round/shift/bound sequence as above, on four lanes */
+        "psraw $2, %%mm0         \n"
+        "pmaxsw %%mm5, %%mm0     \n"
+        "pminsw %%mm6, %%mm0     \n"
+        "movq %%mm0, -8(%5,%0,4) \n" /* store both pairs back in place */
+        "sub $2, %0              \n" /* two pairs consumed */
+        "jnz 1b                  \n" /* repeat until the count reaches 0 */
+        "2:                      \n" /* exit label */
+        :"+r"(i), "+m"(M64( mvc )) /* %0 = i (read/write register), %1 = memory at mvc declared read/write. NOTE(review): if M64 is a single 64-bit access this names only the first 8 bytes - confirm that is sufficient for the compiler */
+        :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc) /* %2 = mv_min, %3 = mv_max, %4 = pw_2, %5 = base pointer of mvc[]; no clobber list is given for mm0/mm5-mm7 or the flags */
+    );
+}
+
  #undef M128_ZERO
  #define M128_ZERO ((__m128){0,0,0,0})
  #define x264_union128_t x264_union128_sse_t
diff --git a/encoder/me.c b/encoder/me.c

index 4be14d4ef614fe4d64902e048430217793da011d..5b7f1bd68354a42eb334b7bf841da54e83ee0a76 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -241,14 +241,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
           * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
           * biasing against use of the predicted motion vector. */
          bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
+        uint32_t bmv = pack16to32_mask( bmx, bmy );
+        if( i_mvc )
+            x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
          for( int i = 0; i < i_mvc; i++ )
          {
-            int mx = (mvc[i][0] + 2) >> 2;
-            int my = (mvc[i][1] + 2) >> 2;
-            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
+            if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
              {
-                mx = x264_clip3( mx, mv_x_min, mv_x_max );
-                my = x264_clip3( my, mv_y_min, mv_y_max );
+                int mx = mvc[i][0];
+                int my = mvc[i][1];
                  COST_MV( mx, my );
              }
          }
author	Fiona Glaser <fiona@x264.com>
	Fri, 16 Apr 2010 19:06:07 +0000 (12:06 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Fri, 23 Apr 2010 19:40:11 +0000 (12:40 -0700)
common/common.h		patch \| blob \| history
common/x86/util.h		patch \| blob \| history
encoder/me.c		patch \| blob \| history