Store MV deltas as clipped absolute values.
This means CABAC no longer has to calculate absolute values in MV context selection.
This also halves the memory spent on MVDs, speeding up cache_mvd and reducing memory usage by 32*threads*(num macroblocks) bytes.
On a Core i7 encoding 1080p, this is about 3 megabytes saved.
return sum;
}
/* Sum the left/top neighbor MVD magnitudes and map each component to a CABAC
 * context bucket.  MVDs are now stored as clipped absolute values, so the
 * abs() calls below are no-ops on the unsigned inputs (kept for clarity). */
-static inline uint32_t x264_cabac_amvd_sum( int16_t *mvdleft, int16_t *mvdtop )
+static inline uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop )
{
int amvd0 = abs(mvdleft[0]) + abs(mvdtop[0]);
int amvd1 = abs(mvdleft[1]) + abs(mvdtop[1]);
/* Bucket each summed magnitude: 0 (<=2), 1 (3..32), 2 (>32). */
amvd0 = (amvd0 > 2) + (amvd0 > 32);
amvd1 = (amvd1 > 2) + (amvd1 > 32);
- return amvd0 + (amvd1<<16);
+ /* Pack: x context in the low byte, y context in the high byte. */
+ return amvd0 + (amvd1<<8);
}
extern const uint8_t x264_exp2_lut[64];
uint8_t (*non_zero_count)[16+4+4]; /* nzc. for I_PCM set to 16 */
int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */
int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */
- int16_t (*mvd[2])[2]; /* mb mv difference with predict. set to 0 if intra. cabac only */
+ uint8_t (*mvd[2])[2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */
int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */
int16_t (*mvr[2][32])[2]; /* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
/* 0 if not available */
ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
- ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+ ALIGNED_8( uint8_t mvd[2][X264_SCAN8_SIZE][2] );
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
if( h->param.b_cabac )
{
CHECKED_MALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
- CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(int16_t) );
- CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(int16_t) );
+ CHECKED_MALLOC( h->mb.mvd[0], 2*16 * i_mb_count * sizeof(uint8_t) );
+ CHECKED_MALLOC( h->mb.mvd[1], 2*16 * i_mb_count * sizeof(uint8_t) );
}
for( i=0; i<2; i++ )
if( h->param.b_cabac )
{
if( i_top_type >= 0 )
- {
- const int i8 = x264_scan8[0] - 8;
- const int iv = i_top_4x4;
- CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
- CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
- }
+ CP64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8], h->mb.mvd[i_list][i_top_4x4] );
else
- {
- const int i8 = x264_scan8[0] - 8;
- M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
- M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
- }
+ M64( h->mb.cache.mvd[i_list][x264_scan8[0] - 8] ) = 0;
if( i_left_type >= 0 )
{
const int i8 = x264_scan8[0] - 1;
const int iv = i_mb_4x4 - 1;
- CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
- CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
- CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
- CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+ CP16( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
}
else
{
const int i8 = x264_scan8[0] - 1;
for( i = 0; i < 4; i++ )
- M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
+ M16( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
}
}
}
if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
{
for( y = 0; y < 4; y++ )
- {
- CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
- CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
- }
+ CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4], h->mb.cache.mvd[0][x264_scan8[0]+8*y] );
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
- {
- CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
- CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
- }
+ CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4], h->mb.cache.mvd[1][x264_scan8[0]+8*y] );
}
else
{
for( y = 0; y < 4; y++ )
- {
- M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
- M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
- }
+ M64( h->mb.mvd[0][i_mb_4x4+y*s4x4] ) = 0;
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
- {
- M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
- M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
- }
+ M64( h->mb.mvd[1][i_mb_4x4+y*s4x4] ) = 0;
}
if( h->sh.i_type == SLICE_TYPE_B )
if( height == 4 ) M16( d+6 ) = val2;
}
}
+/* 16-bit analogue of x264_macroblock_cache_rect4: fill a width x height
+ * rectangle of the 8-entry-stride scan8 cache with a packed 8+8-bit value
+ * (one clipped-absolute MVD pair per 4x4 block). */
+static ALWAYS_INLINE void x264_macroblock_cache_rect2( void *dst, int width, int height, uint16_t val )
+{
+ uint16_t *d = dst;
+ /* Replicate the 16-bit value to 32 and 64 bits for wide stores. */
+ uint32_t val32 = val + (val<<16);
+ uint64_t val64 = val32 + ((uint64_t)val32<<32);
+ if( width == 4 )
+ {
+ M64( d+ 0 ) = val64;
+ if( height >= 2 ) M64( d+ 8 ) = val64;
+ /* Rows 2 and 3 only exist for 4-high rectangles. */
+ if( height == 4 ) M64( d+16 ) = val64;
+ if( height == 4 ) M64( d+24 ) = val64;
+ }
+ else if( width == 2 )
+ {
+ M32( d+ 0 ) = val32;
+ if( height >= 2 ) M32( d+ 8 ) = val32;
+ if( height == 4 ) M32( d+16 ) = val32;
+ if( height == 4 ) M32( d+24 ) = val32;
+ }
+ else //if( width == 1 )
+ {
+ M16( d+ 0 ) = val;
+ if( height >= 2 ) M16( d+ 8 ) = val;
+ if( height == 4 ) M16( d+16 ) = val;
+ if( height == 4 ) M16( d+24 ) = val;
+ }
+}
static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
{
int dy;
{
x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
}
/* Broadcast a packed MVD pair (8-bit |x|, 8-bit |y|) into the MVD cache for a
 * width x height partition.  Uses the 16-bit rect fill now that MVDs are bytes. */
-static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
+static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mv )
{
- x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
+ x264_macroblock_cache_rect2( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
}
static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
{
);
return sum;
}
/* MMX version of x264_cabac_mvd_sum.  Since MVDs are pre-clipped absolute
 * values, the abs/scale trick of the old amvd version is no longer needed:
 * byte adds and two byte compares suffice. */
-#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
-static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
+#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
+static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
{
- static const uint64_t pw_2 = 0x0002000200020002ULL;
- static const uint64_t pw_28 = 0x001C001C001C001CULL;
- static const uint64_t pw_2184 = 0x0888088808880888ULL;
- /* MIN(((x+28)*2184)>>16,2) = (x>2) + (x>32) */
- /* 2184 = fix16(1/30) */
- uint32_t amvd;
+ static const uint64_t pb_2 = 0x0202020202020202ULL;
+ static const uint64_t pb_32 = 0x2020202020202020ULL;
+ int amvd;
asm(
/* Load the two packed 8-bit MVD pairs (only the low 16 bits of each movd
 * are meaningful).  paddb sums per component; each input is <= 33, so the
 * sum is <= 66 and signed byte compares below are safe.
 * pcmpgtb yields 0/-1 masks for (x>2) and (x>32); subtracting both masks
 * from zero produces (x>2)+(x>32) per byte, i.e. the context index. */
- "movd %1, %%mm0 \n"
- "movd %2, %%mm1 \n"
- "pxor %%mm2, %%mm2 \n"
- "pxor %%mm3, %%mm3 \n"
- "psubw %%mm0, %%mm2 \n"
- "psubw %%mm1, %%mm3 \n"
- "pmaxsw %%mm2, %%mm0 \n"
- "pmaxsw %%mm3, %%mm1 \n"
- "paddw %3, %%mm0 \n"
- "paddw %%mm1, %%mm0 \n"
- "pmulhuw %4, %%mm0 \n"
- "pminsw %5, %%mm0 \n"
- "movd %%mm0, %0 \n"
+ "movd %1, %%mm0 \n"
+ "movd %2, %%mm1 \n"
+ "paddb %%mm1, %%mm0 \n"
+ "pxor %%mm2, %%mm2 \n"
+ "movq %%mm0, %%mm1 \n"
+ "pcmpgtb %3, %%mm0 \n"
+ "pcmpgtb %4, %%mm1 \n"
+ "psubb %%mm0, %%mm2 \n"
+ "psubb %%mm1, %%mm2 \n"
+ "movd %%mm2, %0 \n"
:"=r"(amvd)
- :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
- "m"(pw_28),"m"(pw_2184),"m"(pw_2)
+ :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
+ "m"(pb_2),"m"(pb_32)
);
/* Only the low two bytes are meaningful; the uint16_t return truncates the rest. */
return amvd;
}
x264_cabac_encode_decision( cb, 54 + ctx, 0 );
}
-static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
+static inline int x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx )
{
const int i_abs = abs( mvd );
const int ctxbase = l ? 47 : 40;
x264_cabac_encode_bypass( cb, mvd < 0 );
}
#endif
+ /* Since we don't need to keep track of MVDs larger than 33, just cap the value.
+ * This lets us store MVDs as 8-bit values instead of 16-bit. */
+ return X264_MIN( i_abs, 33 );
}
/* Encode the MVD of one partition with CABAC and return the packed 8-bit
 * clipped absolute values (|mdx|,|mdy|, each capped at 33) for the MVD cache. */
-static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
+static NOINLINE uint16_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
{
ALIGNED_4( int16_t mvp[2] );
- uint32_t amvd;
int mdx, mdy;
/* Calculate mvd */
x264_mb_predict_mv( h, i_list, idx, width, mvp );
mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0];
mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
/* Context selection from the left (scan8 - 1) and top (scan8 - 8) cached MVDs. */
- amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
- h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
+ uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1],
+ h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]);
/* encode */
/* _cpn now returns the clipped absolute MVD so it can be stored directly. */
- x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF );
- x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 );
+ mdx = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF );
+ mdy = x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 );
- return pack16to32_mask(mdx,mdy);
+ return pack8to16(mdx,mdy);
}
/* Wrapper: encode the MVD, then broadcast the packed clipped-absolute value
 * into the MVD cache for the whole width x height partition. */
#define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\
do\
{\
- uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
+ uint16_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\
x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\
} while(0)
m->mv[0] = bmx;
m->mv[1] = bmy;
x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
- x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) );
+ uint16_t amvd = pack8to16(X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33));
+ x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
h->mb.b_skip_mc = 0;
}