]> git.sesse.net Git - x264/commitdiff
Much faster CAVLC residual coding
author Fiona Glaser <fiona@x264.com>
Mon, 8 Dec 2008 21:44:23 +0000 (13:44 -0800)
committer Fiona Glaser <fiona@x264.com>
Thu, 11 Dec 2008 06:48:18 +0000 (22:48 -0800)
Use a VLC table for common levelcodes instead of constructing them on-the-spot
Branchless version of i_trailing calculation (2x faster on Nehalem)
Completely remove array_non_zero_count and instead use the count calculated in level/run coding.  Note: this slightly changes output with subme > 7 due to different nonzero counts being stored during qpel RD.

common/bs.h
common/common.h
common/macroblock.h
common/vlc.c
common/x86/util.h
encoder/cabac.c
encoder/cavlc.c
encoder/encoder.c
encoder/macroblock.c

index f1be7a829618b672ccf6b90997660ce472f6bb0c..613b2b0e75bba8d74d361ac1bc17cf6f02051e71 100644 (file)
@@ -31,6 +31,14 @@ typedef struct
     uint8_t i_size;
 } vlc_t;
 
+typedef struct /* one precomputed level code plus the table to use for the following level */
+{
+    uint16_t i_bits; /* code value; bs_write_vlc() hands it to bs_write() as the bits argument */
+    uint8_t  i_size; /* code length; bs_write_vlc() hands it to bs_write() as the size argument */
+    /* Next level table to use: the suffix length (first index of x264_level_token) for the following level */
+    uint8_t  i_next;
+} vlc_large_t;
+
 typedef struct bs_s
 {
     uint8_t *p_start;
@@ -47,6 +55,14 @@ extern const vlc_t x264_total_zeros[15][16];
 extern const vlc_t x264_total_zeros_dc[3][4];
 extern const vlc_t x264_run_before[7][15];
 
+/* A larger level table size theoretically could help a bit at extremely
+ * high bitrates, but the cost in cache is usually too high for it to be
+ * useful.
+ * This size appears to be optimal for QP18 encoding on a Nehalem CPU.
+ * FIXME: Do further testing? */
+#define LEVEL_TABLE_SIZE 128
+extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+
 static inline void bs_init( bs_t *s, void *p_data, int i_data )
 {
     int offset = ((intptr_t)p_data & (WORD_SIZE-1));
index c8405fc3242a5db29be5a32b7eb37d7dbeda2134..f2a0c54a9bc5f2c6b8659b693fdfd4dd3cb01eea 100644 (file)
@@ -99,6 +99,7 @@ char *x264_param2string( x264_param_t *p, int b_res );
 void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
 
 void x264_reduce_fraction( int *n, int *d );
+void x264_init_vlc_tables();
 
 static inline uint8_t x264_clip_uint8( int x )
 {
index f38c047836b38389978d3381a65761015095886e..708c449fdd49cdbf68b7e4f2484719da956241d5 100644 (file)
@@ -424,18 +424,6 @@ static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
         return 0;
     }
 }
-/* This function and its MMX version only work on arrays of size 16 */
-static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
-{
-    int i;
-    int i_nz;
-
-    for( i = 0, i_nz = 0; i < 16; i++ )
-        if( v[i] )
-            i_nz++;
-
-    return i_nz;
-}
 static inline int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
 {
     const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
index 26ab90f9153730a8e71c30c9b6de0699a0df1551..6c159631436329381f302200e34bcc58adbc73a2 100644 (file)
@@ -884,3 +884,49 @@ const vlc_t x264_run_before[7][15] =
         MKVLC( 0x1, 11 ), /* str=00000000001 */
     },
 };
+
+vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+
+void x264_init_vlc_tables() /* fills x264_level_token[suffix length][level + LEVEL_TABLE_SIZE/2] for every table-sized level */
+{
+    int16_t level;
+    int i_suffix;
+    for( i_suffix = 0; i_suffix < 7; i_suffix++ )
+        for( level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
+        {
+            int mask = level >> 15; /* -1 for negative levels, 0 otherwise; assumes an arithmetic right shift */
+            int abs_level = (level^mask)-mask; /* branchless absolute value */
+            int i_level_code = abs_level*2-mask-2; /* 2*|level|-2 for positive, 2*|level|-1 for negative; NOTE(review): level 0 gives -2, presumably never looked up - confirm */
+            int i_next = i_suffix;
+            vlc_large_t *vlc = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2]; /* index is biased so negative levels map to 0..LEVEL_TABLE_SIZE/2-1 */
+
+            if( ( i_level_code >> i_suffix ) < 14 ) /* common case: prefix and suffix stored as a single code */
+            {
+                vlc->i_size = (i_level_code >> i_suffix) + 1 + i_suffix;
+                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1));
+            }
+            else if( i_suffix == 0 && i_level_code < 30 ) /* suffix length 0, level codes 14..29: fixed 19-bit code */
+            {
+                vlc->i_size = 19;
+                vlc->i_bits = (1<<4) + (i_level_code - 14);
+            }
+            else if( i_suffix > 0 && ( i_level_code >> i_suffix ) == 14 ) /* same layout as the first branch, size spelled out as 15 + suffix bits */
+            {
+                vlc->i_size = 15 + i_suffix;
+                vlc->i_bits = (1<<i_suffix) + (i_level_code & ((1<<i_suffix)-1));
+            }
+            else /* escape form: 28 bits, matching the 16 + 12 bits block_residual_write_cavlc_escape() writes while its prefix stays at 15 */
+            {
+                i_level_code -= 15 << i_suffix;
+                if( i_suffix == 0 )
+                    i_level_code -= 15;
+                vlc->i_size = 28;
+                vlc->i_bits = (1<<12) + i_level_code;
+            }
+            if( i_next == 0 ) /* suffix length 0 always advances to 1 after a level */
+                i_next++;
+            if( abs_level > (3 << (i_next-1)) && i_next < 6 ) /* grow by one for large levels, never beyond 6 */
+                i_next++;
+            vlc->i_next = i_next;
+        }
+}
index 07317a24e31a2a04ac6faa24c1fab7f075065f6a..b86f37aa181c43b47270799cfc9e688bcef53fe7 100644 (file)
@@ -74,26 +74,6 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
     sum += output[0] + output[1] + output[2] + output[3];
     return sum;
 }
-#define array_non_zero_count array_non_zero_count_mmx
-static inline int array_non_zero_count_mmx( int16_t *v )
-{
-    int count;
-    asm(
-        "pxor     %%mm7,  %%mm7 \n"
-        "movq     (%1),   %%mm0 \n"
-        "movq     8(%1),  %%mm1 \n"
-        "packsswb 16(%1), %%mm0 \n"
-        "packsswb 24(%1), %%mm1 \n"
-        "pcmpeqb  %%mm7,  %%mm0 \n"
-        "pcmpeqb  %%mm7,  %%mm1 \n"
-        "paddb    %%mm0,  %%mm1 \n"
-        "psadbw   %%mm7,  %%mm1 \n"
-        "movd     %%mm1,  %0    \n"
-        :"=r"(count)
-        :"r"(v), "m"(*(struct {int16_t x[16];} *)v)
-    );
-    return (count+0x10)&0xff;
-}
 #undef array_non_zero_int
 #define array_non_zero_int array_non_zero_int_mmx
 static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
index d904385d297ddf7ba38c4814e6c52ce637a4eaf1..4c6d94f3e5e6077efdf06e96da4417fb3b9f01b3 100644 (file)
@@ -1073,29 +1073,15 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
         if( h->mb.i_cbp_luma & (1 << i8) )
         {
             if( h->mb.b_transform_8x8 )
-            {
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
-                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
                 block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
-            }
             else
             {
                 int i4;
                 for( i4 = 0; i4 < 4; i4++ )
-                {
-                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
                     block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
-                }
             }
         }
-        else
-        {
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
-        }
 
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] );
         block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
         block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
 
@@ -1106,14 +1092,12 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
 static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel )
 {
     int b_8x4 = i_pixel == PIXEL_8x4;
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
     block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     if( i_pixel == PIXEL_4x4 )
         x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 );
     else
     {
         x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 );
-        h->mb.cache.non_zero_count[x264_scan8[i4+2-b_8x4]] = array_non_zero( h->dct.luma4x4[i4+2-b_8x4] );
         block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 );
     }
 }
index a027751f95494c0483313d3b0f5f1cabd639b95c..ee87c54a385644d07ff52c64abc3563fc169a664 100644 (file)
@@ -56,25 +56,70 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
     10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
 };
 
-static inline void bs_write_vlc( bs_t *s, vlc_t v )
-{
-    bs_write( s, v.i_size, v.i_bits );
-}
+#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )
 
 /****************************************************************************
  * block_residual_write_cavlc:
  ****************************************************************************/
+static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_suffix_length, int level ) /* writes one level that block_residual_write_cavlc could not take from x264_level_token; returns the suffix length for the next level */
+{
+    static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff }; /* |level| thresholds indexed by suffix length; entry 0 is never read (0 is bumped to 1 first) */
+    int i_level_prefix = 15;
+    int mask = level >> 15; /* -1 for negative levels, 0 otherwise; assumes level fits in 16 bits and an arithmetic right shift */
+    int abs_level = (level^mask)-mask; /* branchless absolute value */
+    int i_level_code = abs_level*2-mask-2; /* 2*|level|-2 for positive, 2*|level|-1 for negative */
+    if( ( i_level_code >> i_suffix_length ) < 15 ) /* prefix still below 15: prefix and suffix go out in a single bs_write */
+    {
+        bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
+                 (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
+    }
+    else
+    {
+        i_level_code -= 15 << i_suffix_length; /* remove the part of the code covered by the prefix of 15 */
+        if( i_suffix_length == 0 )
+            i_level_code -= 15;
+
+        /* If the prefix size exceeds 15, High Profile is required. */
+        if( i_level_code >= 1<<12 )
+        {
+            if( h->sps->i_profile_idc >= PROFILE_HIGH )
+            {
+                while( i_level_code > 1<<(i_level_prefix-3) ) /* lengthen the prefix until the remainder fits in i_level_prefix-3 bits */
+                {
+                    i_level_code -= 1<<(i_level_prefix-3);
+                    i_level_prefix++;
+                }
+            }
+            else
+            {
+#if RDO_SKIP_BS
+                /* Weight highly against overflows. */
+                s->i_bits_encoded += 1000000;
+#else
+                x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
+                /* clip level, preserving sign */
+                i_level_code = (1<<12) - 2 + (i_level_code & 1);
+#endif
+            }
+        }
+        bs_write( s, i_level_prefix + 1, 1 ); /* prefix: the value 1 in i_level_prefix+1 bits */
+        bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) ); /* suffix: low i_level_prefix-3 bits of the remaining code */
+    }
+    if( i_suffix_length == 0 ) /* suffix length 0 always advances to 1 after a level */
+        i_suffix_length++;
+    if( abs_level > next_suffix[i_suffix_length] ) /* the 0xffff entry keeps suffix length 6 from growing further */
+        i_suffix_length++;
+    return i_suffix_length;
+}
+
 static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
 {
-    static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
+    static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
+    static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
     int level[16], run[16];
-    int i_total, i_trailing;
-    int i_total_zero;
-    int i_last;
+    int i_trailing, i_total_zero, i_last, i_suffix_length, i;
+    int i_total = 0;
     unsigned int i_sign;
-    int i;
-    int idx = 0;
-    int i_suffix_length;
     /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
     int nC = i_idx >= 25 ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_idx == 24 ? 0 : i_idx )];
 
@@ -85,97 +130,66 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
     }
 
     i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
-    i_sign = 0;
-    i_total = 0;
-    i_trailing = 0;
     i_total_zero = i_last + 1;
 
     /* level and run and total */
-    while( i_last >= 0 )
+    /* set these to 2 to allow branchless i_trailing calculation */
+    level[1] = 2;
+    level[2] = 2;
+    do
     {
         int r = 0;
-        level[idx] = l[i_last];
+        level[i_total] = l[i_last];
         while( --i_last >= 0 && l[i_last] == 0 )
             r++;
-        run[idx++] = r;
-    }
+        run[i_total++] = r;
+    } while( i_last >= 0 );
 
-    i_total = idx;
-    i_total_zero -= idx;
+    h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
 
-    i_trailing = X264_MIN(3, idx);
-    for( idx = 0; idx < i_trailing; idx++ )
-    {
-        if( (unsigned)(level[idx]+1) > 2 )
-        {
-            i_trailing = idx;
-            break;
-        }
-        i_sign <<= 1;
-        i_sign |= level[idx] < 0;
-    }
+    i_total_zero -= i_total;
+    i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1
+               | ((((level[1]+1) | (1-level[1])) >> 31) & 2)
+               | ((((level[2]+1) | (1-level[2])) >> 31) & 4);
+    i_trailing = ctz_index[i_trailing];
+    i_sign = ((level[2] >> 31) & 1)
+           | ((level[1] >> 31) & 2)
+           | ((level[0] >> 31) & 4);
+    i_sign >>= 3-i_trailing;
 
     /* total/trailing */
     bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing] );
 
-    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
-    if( i_trailing > 0 )
+    i_suffix_length = i_total > 10 && i_trailing < 3;
+    if( i_trailing > 0 || RDO_SKIP_BS )
         bs_write( s, i_trailing, i_sign );
-    for( i = i_trailing; i < i_total; i++ )
+
+    if( i_trailing < i_total )
     {
-        int mask = level[i] >> 15;
-        int abs_level = (level[i]^mask)-mask;
-        int i_level_code = abs_level*2-mask-2;
-
-        if( i == i_trailing && i_trailing < 3 )
-            i_level_code -= 2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
-
-        if( ( i_level_code >> i_suffix_length ) < 14 )
-            bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
-                     (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
-        else if( i_suffix_length == 0 && i_level_code < 30 )
-            bs_write( s, 19, (1<<4) + (i_level_code - 14) );
-        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
-            bs_write( s, 15 + i_suffix_length,
-                      (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
+        int16_t val = level[i_trailing];
+        int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2;
+        if( i_trailing < 3 )
+            val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+        val += LEVEL_TABLE_SIZE/2;
+
+        if( (unsigned)val_original < LEVEL_TABLE_SIZE )
+        {
+            bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+            i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
+        }
         else
+            i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+        for( i = i_trailing+1; i < i_total; i++ )
         {
-            int i_level_prefix = 15;
-            i_level_code -= 15 << i_suffix_length;
-            if( i_suffix_length == 0 )
-                i_level_code -= 15;
-
-            /* If the prefix size exceeds 15, High Profile is required. */
-            if( i_level_code >= 1<<12 )
+            val = level[i] + LEVEL_TABLE_SIZE/2;
+            if( (unsigned)val < LEVEL_TABLE_SIZE )
             {
-                if( h->sps->i_profile_idc >= PROFILE_HIGH )
-                {
-                    while( i_level_code > 1<<(i_level_prefix-3) )
-                    {
-                        i_level_code -= 1<<(i_level_prefix-3);
-                        i_level_prefix++;
-                    }
-                }
-                else
-                {
-#if RDO_SKIP_BS
-                    /* Weight highly against overflows. */
-                    s->i_bits_encoded += 1000000;
-#else
-                    x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
-                    /* clip level, preserving sign */
-                    i_level_code = (1<<12) - 2 + (i_level_code & 1);
-#endif
-                }
+                bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+                i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
             }
-            bs_write( s, i_level_prefix + 1, 1 );
-            bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
+            else
+                i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
         }
-
-        if( i_suffix_length == 0 )
-            i_suffix_length++;
-        if( abs_level > (3 << (i_suffix_length-1)) && i_suffix_length < 6 )
-            i_suffix_length++;
     }
 
     if( i_total < i_count )
@@ -269,16 +283,17 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
         /* shuffle 8x8 dct coeffs into 4x4 lists */
         for( i8 = i8start; i8 <= i8end; i8++ )
             if( h->mb.i_cbp_luma & (1 << i8) )
+            {
                 h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
+                for( i4 = 0; i4 < 4; i4++ )
+                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
+            }
     }
 
     for( i8 = i8start; i8 <= i8end; i8++ )
         if( h->mb.i_cbp_luma & (1 << i8) )
             for( i4 = 0; i4 < 4; i4++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
                 block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
-            }
 }
 
 /*****************************************************************************
@@ -595,10 +610,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         /* AC Luma */
         if( h->mb.i_cbp_luma )
             for( i = 0; i < 16; i++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
-            }
     }
     else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
     {
@@ -612,10 +624,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
             for( i = 16; i < 24; i++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
-            }
     }
 
 #if !RDO_SKIP_BS
@@ -663,9 +672,7 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
     for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
     {
         x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
         block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
         block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
         i8 += x264_pixel_size[i_pixel].h >> 3;
     }
@@ -679,12 +686,10 @@ static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
     int b_8x4 = i_pixel == PIXEL_8x4;
     s.i_bits_encoded = 0;
     cavlc_mb_mvd( h, &s, 0, i4, 1+b_8x4 );
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
     block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     if( i_pixel != PIXEL_4x4 )
     {
         i4 += 2-b_8x4;
-        h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
         block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     }
 
@@ -706,7 +711,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
     h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
     for( i4 = 0; i4 < 4; i4++ )
     {
-        h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
+        h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
         block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
     }
     return h->out.bs.i_bits_encoded;
@@ -715,7 +720,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
 static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
 {
     h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
-    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero_count( h->dct.luma4x4[i4] );
+    h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
     block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
     return h->out.bs.i_bits_encoded;
 }
@@ -732,10 +737,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
         {
             int i;
             for( i = 16; i < 24; i++ )
-            {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
-            }
         }
     }
     return h->out.bs.i_bits_encoded;
index c1de199ee5ecf4ce9748c242b7ae5fde754fca0c..73d33f4a61c7de2ffdf0fc7788a2e9187795dba7 100644 (file)
@@ -730,7 +730,8 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
     x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
     x264_predict_8x8_init( h->param.cpu, h->predict_8x8 );
     x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
-
+    if( !h->param.b_cabac ) /* the level tables are only read by the CAVLC residual writer */
+        x264_init_vlc_tables();
     x264_pixel_init( h->param.cpu, &h->pixf );
     x264_dct_init( h->param.cpu, &h->dctf );
     x264_zigzag_init( h->param.cpu, &h->zigzagf, h->param.b_interlaced );
index 42c931af4dce02a3338b057395ac2798bfe9bc0d..11801b78d7fa0fccba5255d49e967f7a67cad40e 100644 (file)
@@ -855,15 +855,20 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
         {
             h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
             nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
+            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101 * nnz8x8;
+            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101 * nnz8x8;
         }
         else
         {
             for( i4 = i8*4; i4 < i8*4+4; i4++ )
             {
+                int nz;
                 h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
                                     h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
                                     h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
-                nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] );
+                nz = array_non_zero( h->dct.luma4x4[i4] );
+                h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
+                nnz8x8 |= nz;
             }
         }
         for( ch = 0; ch < 2; ch++ )
@@ -872,6 +877,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
             h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
             h->dct.luma4x4[16+i8+ch*4][0] = 0;
+            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
         }
     }
     else
@@ -892,6 +898,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             {
                 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
                 h->dctf.add8x8_idct8( p_fdec, dct8x8 );
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
+            }
+            else
+            {
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
             }
         }
         else
@@ -918,9 +931,17 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             if( nnz8x8 )
             {
                 for( i4 = 0; i4 < 4; i4++ )
+                {
                     h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
+                    h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = array_non_zero( dct4x4[i4] );
+                }
                 h->dctf.add8x8_idct( p_fdec, dct4x4 );
             }
+            else
+            {
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0;
+                *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0;
+            }
         }
 
         i_qp = h->mb.i_chroma_qp;
@@ -944,7 +965,10 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             {
                 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
                 h->dctf.add4x4_idct( p_fdec, dct4x4 );
+                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 1;
             }
+            else
+                h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 0;
         }
     }
     h->mb.i_cbp_luma &= ~(1 << i8);
@@ -967,7 +991,10 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
     h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );
 
     if( h->mb.b_lossless )
+    {
         h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
+        h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
+    }
     else
     {
         DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
@@ -978,6 +1005,9 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
         {
             h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
             h->dctf.add4x4_idct( p_fdec, dct4x4 );
+            h->mb.cache.non_zero_count[x264_scan8[i4]] = 1;
         }
+        else
+            h->mb.cache.non_zero_count[x264_scan8[i4]] = 0;
     }
 }