Much faster CAVLC residual coding

author Fiona Glaser <fiona@x264.com>

Mon, 8 Dec 2008 21:44:23 +0000 (13:44 -0800)

committer Fiona Glaser <fiona@x264.com>

Thu, 11 Dec 2008 06:48:18 +0000 (22:48 -0800)
author Fiona Glaser <fiona@x264.com>
Mon, 8 Dec 2008 21:44:23 +0000 (13:44 -0800)
committer Fiona Glaser <fiona@x264.com>
Thu, 11 Dec 2008 06:48:18 +0000 (22:48 -0800)
diff --git a/common/bs.h b/common/bs.h

index f1be7a829618b672ccf6b90997660ce472f6bb0c..613b2b0e75bba8d74d361ac1bc17cf6f02051e71 100644 (file)
--- a/common/bs.h
+++ b/common/bs.h
@@ -31,6 +31,14 @@ typedef struct
      uint8_t i_size;
  } vlc_t;
  
+typedef struct /* One precomputed entry of the x264_level_token lookup table. */
+{
+    uint16_t i_bits; /* code value; bs_write_vlc() passes it as the last argument of bs_write() */
+    uint8_t  i_size; /* code length; bs_write_vlc() passes it as the size argument of bs_write() */
+    /* Next level table to use, i.e. the suffix length (first index of
+     * x264_level_token) to apply to the level coded after this one. */
+    uint8_t  i_next;
+} vlc_large_t;
+
  typedef struct bs_s
  {
      uint8_t *p_start;
@@ -47,6 +55,14 @@ extern const vlc_t x264_total_zeros[15][16];
  extern const vlc_t x264_total_zeros_dc[3][4];
  extern const vlc_t x264_run_before[7][15];
  
+/* A larger level table size theoretically could help a bit at extremely
+ * high bitrates, but the cost in cache is usually too high for it to be
+ * useful.
+ * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
+ * FIXME: Do further testing? */
+#define LEVEL_TABLE_SIZE 128
+extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+
  static inline void bs_init( bs_t *s, void *p_data, int i_data )
  {
      int offset = ((intptr_t)p_data & (WORD_SIZE-1));
diff --git a/common/common.h b/common/common.h

index c8405fc3242a5db29be5a32b7eb37d7dbeda2134..f2a0c54a9bc5f2c6b8659b693fdfd4dd3cb01eea 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -99,6 +99,7 @@ char *x264_param2string( x264_param_t *p, int b_res );
  void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
  
  void x264_reduce_fraction( int *n, int *d );
+void x264_init_vlc_tables();
  
  static inline uint8_t x264_clip_uint8( int x )
  {
diff --git a/common/macroblock.h b/common/macroblock.h

index f38c047836b38389978d3381a65761015095886e..708c449fdd49cdbf68b7e4f2484719da956241d5 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -424,18 +424,6 @@ static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
          return 0;
      }
  }
-/* This function and its MMX version only work on arrays of size 16 */
-static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
-{
-    int i;
-    int i_nz;
-
-    for( i = 0, i_nz = 0; i < 16; i++ )
-        if( v[i] )
-            i_nz++;
-
-    return i_nz;
-}
  static inline int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
  {
      const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
diff --git a/common/vlc.c b/common/vlc.c

index 26ab90f9153730a8e71c30c9b6de0699a0df1551..6c159631436329381f302200e34bcc58adbc73a2 100644 (file)
--- a/common/vlc.c
+++ b/common/vlc.c
@@ -884,3 +884,49 @@ const vlc_t x264_run_before[7][15] =
          MKVLC( 0x1, 11 ), /* str=00000000001 */
      },
  };
+
+vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+
+void x264_init_vlc_tables() /* Fill the global x264_level_token table.
+ * For each suffix length 0..6 and each level in
+ * [-LEVEL_TABLE_SIZE/2, LEVEL_TABLE_SIZE/2) this stores the code bits, the
+ * code size and the suffix length to use for the following level.
+ * Takes no arguments and returns nothing; its only effect is writing the table.
+ * NOTE(review): the level == 0 entry is derived from a negative i_level_code;
+ * presumably it is never looked up because zero is not coded as a level - confirm. */
+{
+    int16_t level;
+    int i_suffix;
+    for( i_suffix = 0; i_suffix < 7; i_suffix++ )
+        for( level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
+        {
+            int mask = level >> 15; /* -1 for negative levels, 0 otherwise (relies on arithmetic right shift) */
+            int abs_level = (level^mask)-mask; /* branchless absolute value */
+            int i_level_code = abs_level*2-mask-2; /* 2*|level|-2 for positive, 2*|level|-1 for negative levels */
+            int i_next = i_suffix;
+            vlc_large_t *vlc = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2]; /* index biased so negative levels land inside the table */
+
+            if( ( i_level_code >> i_suffix ) < 14 ) /* short code: size grows with the part of the code above the suffix bits */
+            {
+                vlc->i_size = (i_level_code >> i_suffix) + 1 + i_suffix;
+                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1)); /* a 1 marker followed by the low i_suffix bits */
+            }
+            else if( i_suffix == 0 && i_level_code < 30 ) /* suffix length 0, codes 14..29: fixed 19-bit code */
+            {
+                vlc->i_size = 19;
+                vlc->i_bits = (1<<4) + (i_level_code - 14);
+            }
+            else if( i_suffix > 0 && ( i_level_code >> i_suffix ) == 14 ) /* non-zero suffix length, upper part exactly 14 */
+            {
+                vlc->i_size = 15 + i_suffix;
+                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1));
+            }
+            else /* everything larger: fixed 28-bit code, a 1 marker above a 12-bit remainder */
+            {
+                i_level_code -= 15 << i_suffix;
+                if( i_suffix == 0 )
+                    i_level_code -= 15;
+                vlc->i_size = 28;
+                vlc->i_bits = (1<<12) + i_level_code;
+            }
+            if( i_next == 0 ) /* suffix length 0 always advances to 1 */
+                i_next++;
+            if( abs_level > (3 << (i_next-1)) && i_next < 6 ) /* advance once more for large levels, never beyond 6 */
+                i_next++;
+            vlc->i_next = i_next;
+        }
+}
diff --git a/common/x86/util.h b/common/x86/util.h

index 07317a24e31a2a04ac6faa24c1fab7f075065f6a..b86f37aa181c43b47270799cfc9e688bcef53fe7 100644 (file)
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -74,26 +74,6 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
      sum += output[0] + output[1] + output[2] + output[3];
      return sum;
  }
-#define array_non_zero_count array_non_zero_count_mmx
-static inline int array_non_zero_count_mmx( int16_t *v )
-{
-    int count;
-    asm(
-        "pxor     %%mm7,  %%mm7 \n"
-        "movq     (%1),   %%mm0 \n"
-        "movq     8(%1),  %%mm1 \n"
-        "packsswb 16(%1), %%mm0 \n"
-        "packsswb 24(%1), %%mm1 \n"
-        "pcmpeqb  %%mm7,  %%mm0 \n"
-        "pcmpeqb  %%mm7,  %%mm1 \n"
-        "paddb    %%mm0,  %%mm1 \n"
-        "psadbw   %%mm7,  %%mm1 \n"
-        "movd     %%mm1,  %0    \n"
-        :"=r"(count)
-        :"r"(v), "m"(*(struct {int16_t x[16];} *)v)
-    );
-    return (count+0x10)&0xff;
-}
  #undef array_non_zero_int
  #define array_non_zero_int array_non_zero_int_mmx
  static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
diff --git a/encoder/cabac.c b/encoder/cabac.c

index d904385d297ddf7ba38c4814e6c52ce637a4eaf1..4c6d94f3e5e6077efdf06e96da4417fb3b9f01b3 100644 (file)
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -1073,29 +1073,15 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
          if( h->mb.i_cbp_luma & (1 << i8) )
          {
              if( h->mb.b_transform_8x8 )
-            {
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
                  block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
-            }
              else
              {
                  int i4;
                  for( i4 = 0; i4 < 4; i4++ )
-                {
-                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
                      block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
-                }
              }
          }
-        else
-        {
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
-        }
  
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] );
          block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
          block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
  
@@ -1106,14 +1092,12 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
  static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
  {
      int b_8x4 = i_pixel == PIXEL_8x4;
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
      block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
      if( i_pixel == PIXEL_4x4 )
          x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
      else
      {
          x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
-        h->mb.cache.non_zero_count[x264_scan8[i4+2-b_8x4]] = array_non_zero( h->dct.luma4x4[i4+2-b_8x4] );
          block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 );
      }
  }
diff --git a/encoder/cavlc.c b/encoder/cavlc.c

index a027751f95494c0483313d3b0f5f1cabd639b95c..ee87c54a385644d07ff52c64abc3563fc169a664 100644 (file)
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -56,25 +56,70 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
      10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
  };
  
-static inline void bs_write_vlc( bs_t *s, vlc_t v )
-{
-    bs_write( s, v.i_size, v.i_bits );
-}
+#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )
  
  /****************************************************************************
   * block_residual_write_cavlc:
   ****************************************************************************/
+static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_suffix_length, int level ) /*
+ * Write one coefficient level without using the x264_level_token table and
+ * return the suffix length to use for the next level.
+ *   h               - encoder handle; read for sps->i_profile_idc and used for logging
+ *   s               - bitstream that receives the code through bs_write()
+ *   i_suffix_length - suffix length in effect for this level
+ *   level           - signed level to code
+ */
+{
+    static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff }; /* |level| above entry [n] bumps suffix length n; index 0 is never read because 0 is raised to 1 first */
+    int i_level_prefix = 15;
+    int mask = level >> 15; /* -1 for negative, 0 otherwise; assumes level fits the int16_t range - TODO confirm */
+    int abs_level = (level^mask)-mask; /* branchless absolute value */
+    int i_level_code = abs_level*2-mask-2; /* 2*|level|-2 for positive, 2*|level|-1 for negative levels */
+    if( ( i_level_code >> i_suffix_length ) < 15 ) /* short form: a single write holds marker bit and suffix bits */
+    {
+        bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
+                 (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
+    }
+    else /* escape form: explicit prefix followed by a fixed-width remainder */
+    {
+        i_level_code -= 15 << i_suffix_length;
+        if( i_suffix_length == 0 )
+            i_level_code -= 15;
+
+        /* If the prefix size exceeds 15, High Profile is required. */
+        if( i_level_code >= 1<<12 )
+        {
+            if( h->sps->i_profile_idc >= PROFILE_HIGH )
+            {
+                while( i_level_code > 1<<(i_level_prefix-3) ) /* lengthen the prefix until the remainder fits */
+                {
+                    i_level_code -= 1<<(i_level_prefix-3);
+                    i_level_prefix++;
+                }
+            }
+            else
+            {
+#if RDO_SKIP_BS
+                /* Weight highly against overflows. */
+                s->i_bits_encoded += 1000000;
+#else
+                x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
+                /* clip level, preserving sign */
+                i_level_code = (1<<12) - 2 + (i_level_code & 1);
+#endif
+            }
+        }
+        bs_write( s, i_level_prefix + 1, 1 ); /* the value 1 written in i_level_prefix+1 bits */
+        bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) ); /* remainder in i_level_prefix-3 bits */
+    }
+    if( i_suffix_length == 0 ) /* suffix length 0 always advances to 1 */
+        i_suffix_length++;
+    if( abs_level > next_suffix[i_suffix_length] ) /* advance once more for large levels */
+        i_suffix_length++;
+    return i_suffix_length;
+}
+
  static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
  {
-    static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
+    static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
+    static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
      int level[16], run[16];
-    int i_total, i_trailing;
-    int i_total_zero;
-    int i_last;
+    int i_trailing, i_total_zero, i_last, i_suffix_length, i;
+    int i_total = 0;
      unsigned int i_sign;
-    int i;
-    int idx = 0;
-    int i_suffix_length;
      /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
      int nC = i_idx >= 25 ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_idx == 24 ? 0 : i_idx )];
  
@@ -85,97 +130,66 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
      }
  
      i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
-    i_sign = 0;
-    i_total = 0;
-    i_trailing = 0;
      i_total_zero = i_last + 1;
  
      /* level and run and total */
-    while( i_last >= 0 )
+    /* set these to 2 to allow branchless i_trailing calculation */
+    level[1] = 2;
+    level[2] = 2;
+    do
      {
          int r = 0;
-        level[idx] = l[i_last];
+        level[i_total] = l[i_last];
          while( --i_last >= 0 && l[i_last] == 0 )
              r++;
-        run[idx++] = r;
-    }
+        run[i_total++] = r;
+    } while( i_last >= 0 );
  
-    i_total = idx;
-    i_total_zero -= idx;
+    h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
  
-    i_trailing = X264_MIN(3, idx);
-    for( idx = 0; idx < i_trailing; idx++ )
-    {
-        if( (unsigned)(level[idx]+1) > 2 )
-        {
-            i_trailing = idx;
-            break;
-        }
-        i_sign <<= 1;
-        i_sign |= level[idx] < 0;
-    }
+    i_total_zero -= i_total;
+    i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1
+               | ((((level[1]+1) | (1-level[1])) >> 31) & 2)
+               | ((((level[2]+1) | (1-level[2])) >> 31) & 4);
+    i_trailing = ctz_index[i_trailing];
+    i_sign = ((level[2] >> 31) & 1)
+           | ((level[1] >> 31) & 2)
+           | ((level[0] >> 31) & 4);
+    i_sign >>= 3-i_trailing;
  
      /* total/trailing */
      bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing] );
  
-    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
-    if( i_trailing > 0 )
+    i_suffix_length = i_total > 10 && i_trailing < 3;
+    if( i_trailing > 0 || RDO_SKIP_BS )
          bs_write( s, i_trailing, i_sign );
-    for( i = i_trailing; i < i_total; i++ )
+
+    if( i_trailing < i_total )
      {
-        int mask = level[i] >> 15;
-        int abs_level = (level[i]^mask)-mask;
-        int i_level_code = abs_level*2-mask-2;
-
-        if( i == i_trailing && i_trailing < 3 )
-            i_level_code -= 2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
-
-        if( ( i_level_code >> i_suffix_length ) < 14 )
-            bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
-                     (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
-        else if( i_suffix_length == 0 && i_level_code < 30 )
-            bs_write( s, 19, (1<<4) + (i_level_code - 14) );
-        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
-            bs_write( s, 15 + i_suffix_length,
-                      (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
+        int16_t val = level[i_trailing];
+        int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2;
+        if( i_trailing < 3 )
+            val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        val += LEVEL_TABLE_SIZE/2;
+
+        if( (unsigned)val_original < LEVEL_TABLE_SIZE )
+        {
+            bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+            i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
+        }
          else
+            i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+        for( i = i_trailing+1; i < i_total; i++ )
          {
-            int i_level_prefix = 15;
-            i_level_code -= 15 << i_suffix_length;
-            if( i_suffix_length == 0 )
-                i_level_code -= 15;
-
-            /* If the prefix size exceeds 15, High Profile is required. */
-            if( i_level_code >= 1<<12 )
+            val = level[i] + LEVEL_TABLE_SIZE/2;
+            if( (unsigned)val < LEVEL_TABLE_SIZE )
              {
-                if( h->sps->i_profile_idc >= PROFILE_HIGH )
-                {
-                    while( i_level_code > 1<<(i_level_prefix-3) )
-                    {
-                        i_level_code -= 1<<(i_level_prefix-3);
-                        i_level_prefix++;
-                    }
-                }
-                else
-                {
-#if RDO_SKIP_BS
-                    /* Weight highly against overflows. */
-                    s->i_bits_encoded += 1000000;
-#else
-                    x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
-                    /* clip level, preserving sign */
-                    i_level_code = (1<<12) - 2 + (i_level_code & 1);
-#endif
-                }
+                bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+                i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
              }
-            bs_write( s, i_level_prefix + 1, 1 );
-            bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
+            else
+                i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
          }
-
-        if( i_suffix_length == 0 )
-            i_suffix_length++;
-        if( abs_level > (3 << (i_suffix_length-1)) && i_suffix_length < 6 )
-            i_suffix_length++;
      }
  
      if( i_total < i_count )
@@ -269,16 +283,17 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
          /* shuffle 8x8 dct coeffs into 4x4 lists */
          for( i8 = i8start; i8 <= i8end; i8++ )
              if( h->mb.i_cbp_luma & (1 << i8) )
+            {
                  h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
+                for( i4 = 0; i4 < 4; i4++ )
+                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
+            }
      }
  
      for( i8 = i8start; i8 <= i8end; i8++ )
          if( h->mb.i_cbp_luma & (1 << i8) )
              for( i4 = 0; i4 < 4; i4++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
                  block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
-            }
  }
  
  /*****************************************************************************
@@ -595,10 +610,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
          /* AC Luma */
          if( h->mb.i_cbp_luma )
              for( i = 0; i < 16; i++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                  block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
-            }
      }
      else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
      {
@@ -612,10 +624,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
          block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
          if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
              for( i = 16; i < 24; i++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                  block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
-            }
      }
  
  #if !RDO_SKIP_BS
@@ -663,9 +672,7 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
      for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
      {
          x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
          block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
          block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
          i8 += x264_pixel_size[i_pixel].h >> 3;
      }
@@ -679,12 +686,10 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
      int b_8x4 = i_pixel == PIXEL_8x4;
      s.i_bits_encoded = 0;
      cavlc_mb_mvd( h, &s, 0, i4, 1+b_8x4 );
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
      block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
      if( i_pixel != PIXEL_4x4 )
      {
          i4 += 2-b_8x4;
-        h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
          block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
      }
  
@@ -706,7 +711,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
      h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
      for( i4 = 0; i4 < 4; i4++ )
      {
-        h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
+        h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
          block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
      }
      return h->out.bs.i_bits_encoded;
@@ -715,7 +720,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
  static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
  {
      h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
+    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
      block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
      return h->out.bs.i_bits_encoded;
  }
@@ -732,10 +737,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
          {
              int i;
              for( i = 16; i < 24; i++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                  block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
-            }
          }
      }
      return h->out.bs.i_bits_encoded;
diff --git a/encoder/encoder.c b/encoder/encoder.c

index c1de199ee5ecf4ce9748c242b7ae5fde754fca0c..73d33f4a61c7de2ffdf0fc7788a2e9187795dba7 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -730,7 +730,8 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
      x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
      x264_predict_8x8_init( h->param.cpu, h->predict_8x8 );
      x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
-
+    /* The level-token lookup table is consumed by the CAVLC residual writer,
+     * so it is only built when CABAC is disabled.  (A stray ';' after the
+     * condition previously turned the if into an empty statement.) */
+    if( !h->param.b_cabac )
+        x264_init_vlc_tables();
      x264_pixel_init( h->param.cpu, &h->pixf );
      x264_dct_init( h->param.cpu, &h->dctf );
      x264_zigzag_init( h->param.cpu, &h->zigzagf, h->param.b_interlaced );
diff --git a/encoder/macroblock.c b/encoder/macroblock.c

index 42c931af4dce02a3338b057395ac2798bfe9bc0d..11801b78d7fa0fccba5255d49e967f7a67cad40e 100644 (file)
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -855,15 +855,20 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
          {
              h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
              nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
+            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101 * nnz8x8;
+            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101 * nnz8x8;
          }
          else
          {
              for( i4 = i8*4; i4 < i8*4+4; i4++ )
              {
+                int nz;
                  h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
                                      h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
                                      h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
-                nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
+                nz = array_non_zero( h->dct.luma4x4[i4] );
+                /* i4 already iterates over i8*4 .. i8*4+3, so it indexes
+                 * x264_scan8 directly; adding i8*4 again would select the
+                 * wrong block and run past the luma entries for i8 > 0. */
+                h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
+                nnz8x8 |= nz;
              }
          }
          for( ch = 0; ch < 2; ch++ )
@@ -872,6 +877,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
              p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
              h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
              h->dct.luma4x4[16+i8+ch*4][0] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
          }
      }
      else
@@ -892,6 +898,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
              {
                  h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
                  h->dctf.add8x8_idct8( p_fdec, dct8x8 );
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
+            }
+            else
+            {
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
              }
          }
          else
@@ -918,9 +931,17 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
              if( nnz8x8 )
              {
                  for( i4 = 0; i4 < 4; i4++ )
+                {
                      h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
+                    h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = array_non_zero( dct4x4[i4] );
+                }
                  h->dctf.add8x8_idct( p_fdec, dct4x4 );
              }
+            else
+            {
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
+            }
          }
  
          i_qp = h->mb.i_chroma_qp;
@@ -944,7 +965,10 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
              {
                  h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
                  h->dctf.add4x4_idct( p_fdec, dct4x4 );
+                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 1;
              }
+            else
+                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 0;
          }
      }
      h->mb.i_cbp_luma &= ~(1 << i8);
@@ -967,7 +991,10 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
      h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
  
      if( h->mb.b_lossless )
+    {
          h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
+        h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
+    }
      else
      {
          DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
@@ -978,6 +1005,9 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
          {
              h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
              h->dctf.add4x4_idct( p_fdec, dct4x4 );
+            h->mb.cache.non_zero_count[x264_scan8[i4]] = 1;
          }
+        else
+            h->mb.cache.non_zero_count[x264_scan8[i4]] = 0;
      }
  }
author	Fiona Glaser <fiona@x264.com>
	Mon, 8 Dec 2008 21:44:23 +0000 (13:44 -0800)
committer	Fiona Glaser <fiona@x264.com>
	Thu, 11 Dec 2008 06:48:18 +0000 (22:48 -0800)
common/bs.h		patch \| blob \| history
common/common.h		patch \| blob \| history
common/macroblock.h		patch \| blob \| history
common/vlc.c		patch \| blob \| history
common/x86/util.h		patch \| blob \| history
encoder/cabac.c		patch \| blob \| history
encoder/cavlc.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/macroblock.c		patch \| blob \| history