Fix some invalid reads caught by valgrind

[x264] / encoder / cavlc.c
diff --git a/encoder/cavlc.c b/encoder/cavlc.c

index e499fac598350df3161c24d56a348e9ca243b85d..f0479aa95366810c5e8fea061a0fafd2f368f3d0 100644 (file)
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -112,22 +112,14 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_s
      return i_suffix_length;
  }
  
-static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+static int block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, int16_t *l, int nC )
  {
-    static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
      static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
+    static const int count_cat[5] = {16, 15, 16, 4, 15};
      x264_run_level_t runlevel;
      int i_trailing, i_total_zero, i_suffix_length, i;
      int i_total = 0;
      unsigned int i_sign;
-    /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
-    int nC = i_idx >= 25 ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_idx == 24 ? 0 : i_idx )];
-
-    if( !h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
-    {
-        bs_write_vlc( s, x264_coeff0_token[nC] );
-        return;
-    }
  
      /* level and run and total */
      /* set these to 2 to allow branchless i_trailing calculation */
@@ -136,8 +128,6 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
      i_total = h->quantf.coeff_level_run[i_ctxBlockCat]( l, &runlevel );
      i_total_zero = runlevel.last + 1 - i_total;
  
-    h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
-
      i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
                 | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
                 | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
@@ -151,8 +141,7 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
      bs_write_vlc( s, x264_coeff_token[nC][i_total*4+i_trailing-4] );
  
      i_suffix_length = i_total > 10 && i_trailing < 3;
-    if( i_trailing > 0 || RDO_SKIP_BS )
-        bs_write( s, i_trailing, i_sign );
+    bs_write( s, i_trailing, i_sign );
  
      if( i_trailing < i_total )
      {
@@ -182,9 +171,9 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
          }
      }
  
-    if( i_total < i_count )
+    if( i_total < count_cat[i_ctxBlockCat] )
      {
-        if( i_idx >= 25 )
+        if( i_ctxBlockCat == DCT_CHROMA_DC )
              bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
          else
              bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
@@ -192,10 +181,24 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
  
      for( i = 0; i < i_total-1 && i_total_zero > 0; i++ )
      {
-        int i_zl = X264_MIN( i_total_zero - 1, 6 );
-        bs_write_vlc( s, x264_run_before[i_zl][runlevel.run[i]] );
+        int i_zl = X264_MIN( i_total_zero, 7 );
+        bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
          i_total_zero -= runlevel.run[i];
      }
+
+    return i_total;
+}
+
+static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
+
+#define block_residual_write_cavlc(h,s,cat,idx,l)\
+{\
+    int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? 0 : idx )];\
+    uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
+    if( !*nnz )\
+        bs_write_vlc( s, x264_coeff0_token[nC] );\
+    else\
+        *nnz = block_residual_write_cavlc(h,s,cat,l,nC);\
  }
  
  static void cavlc_qp_delta( x264_t *h, bs_t *s )
@@ -224,43 +227,32 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
  
  static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width )
  {
-    DECLARE_ALIGNED_4( int16_t mvp[2] );
+    ALIGNED_4( int16_t mvp[2] );
      x264_mb_predict_mv( h, i_list, idx, width, mvp );
      bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
      bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
  }
  
-static void cavlc_mb8x8_mvd( x264_t *h, bs_t *s, int i_list, int i )
+static inline void cavlc_mb8x8_mvd( x264_t *h, bs_t *s, int i )
  {
-    if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-        return;
-
      switch( h->mb.i_sub_partition[i] )
      {
          case D_L0_8x8:
-        case D_L1_8x8:
-        case D_BI_8x8:
-            cavlc_mb_mvd( h, s, i_list, 4*i, 2 );
+            cavlc_mb_mvd( h, s, 0, 4*i, 2 );
              break;
          case D_L0_8x4:
-        case D_L1_8x4:
-        case D_BI_8x4:
-            cavlc_mb_mvd( h, s, i_list, 4*i+0, 2 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+2, 2 );
+            cavlc_mb_mvd( h, s, 0, 4*i+0, 2 );
+            cavlc_mb_mvd( h, s, 0, 4*i+2, 2 );
              break;
          case D_L0_4x8:
-        case D_L1_4x8:
-        case D_BI_4x8:
-            cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
+            cavlc_mb_mvd( h, s, 0, 4*i+0, 1 );
+            cavlc_mb_mvd( h, s, 0, 4*i+1, 1 );
              break;
          case D_L0_4x4:
-        case D_L1_4x4:
-        case D_BI_4x4:
-            cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+2, 1 );
-            cavlc_mb_mvd( h, s, i_list, 4*i+3, 1 );
+            cavlc_mb_mvd( h, s, 0, 4*i+0, 1 );
+            cavlc_mb_mvd( h, s, 0, 4*i+1, 1 );
+            cavlc_mb_mvd( h, s, 0, 4*i+2, 1 );
+            cavlc_mb_mvd( h, s, 0, 4*i+3, 1 );
              break;
      }
  }
@@ -273,17 +265,13 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
          /* shuffle 8x8 dct coeffs into 4x4 lists */
          for( i8 = i8start; i8 <= i8end; i8++ )
              if( h->mb.i_cbp_luma & (1 << i8) )
-            {
-                h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
-                for( i4 = 0; i4 < 4; i4++ )
-                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
-            }
+                h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
      }
  
      for( i8 = i8start; i8 <= i8end; i8++ )
          if( h->mb.i_cbp_luma & (1 << i8) )
              for( i4 = 0; i4 < 4; i4++ )
-                block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
+                block_residual_write_cavlc( h, s, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
  }
  
  /*****************************************************************************
@@ -292,7 +280,8 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
  void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
  {
      const int i_mb_type = h->mb.i_type;
-    int i_mb_i_offset;
+    static const int i_offsets[3] = {5,23,0};
+    int i_mb_i_offset = i_offsets[h->sh.i_type];
      int i;
  
  #if !RDO_SKIP_BS
@@ -300,22 +289,6 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
      int       i_mb_pos_tex;
  #endif
  
-    switch( h->sh.i_type )
-    {
-        case SLICE_TYPE_I:
-            i_mb_i_offset = 0;
-            break;
-        case SLICE_TYPE_P:
-            i_mb_i_offset = 5;
-            break;
-        case SLICE_TYPE_B:
-            i_mb_i_offset = 23;
-            break;
-        default:
-            x264_log(h, X264_LOG_ERROR, "internal error or slice unsupported\n" );
-            return;
-    }
-
      if( h->sh.b_mbaff
          && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
      {
@@ -323,8 +296,9 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
      }
  
  #if !RDO_SKIP_BS
-    if( i_mb_type == I_PCM)
+    if( i_mb_type == I_PCM )
      {
+        uint8_t *p_start = s->p_start;
          bs_write_ue( s, i_mb_i_offset + 25 );
          i_mb_pos_tex = bs_pos( s );
          h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
@@ -340,6 +314,9 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
              memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
          s->p += 64;
  
+        bs_init( s, s->p, s->p_end - s->p );
+        s->p_start = p_start;
+
          /* if PCM is chosen, we need to store reconstructed frame data */
          h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
          h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
@@ -382,17 +359,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
      }
      else if( i_mb_type == P_L0 )
      {
-        DECLARE_ALIGNED_4( int16_t mvp[2] );
-
          if( h->mb.i_partition == D_16x16 )
          {
-            bs_write_ue( s, 0 );
+            bs_write1( s, 1 );
  
              if( h->mb.pic.i_fref[0] > 1 )
                  bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-            x264_mb_predict_mv( h, 0, 0, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+            cavlc_mb_mvd( h, s, 0, 0, 4 );
          }
          else if( h->mb.i_partition == D_16x8 )
          {
@@ -402,14 +375,8 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
                  bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
                  bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
              }
-
-            x264_mb_predict_mv( h, 0, 0, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
-            x264_mb_predict_mv( h, 0, 8, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][1] - mvp[1] );
+            cavlc_mb_mvd( h, s, 0, 0, 4 );
+            cavlc_mb_mvd( h, s, 0, 8, 4 );
          }
          else if( h->mb.i_partition == D_8x16 )
          {
@@ -419,14 +386,8 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
                  bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
                  bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
              }
-
-            x264_mb_predict_mv( h, 0, 0, 2, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
-            x264_mb_predict_mv( h, 0, 4, 2, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][1] - mvp[1] );
+            cavlc_mb_mvd( h, s, 0, 0, 2 );
+            cavlc_mb_mvd( h, s, 0, 4, 2 );
          }
      }
      else if( i_mb_type == P_8x8 )
@@ -441,7 +402,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
          else
          {
              bs_write_ue( s, 3 );
-            b_sub_ref = h->mb.pic.i_fref[0] > 1;
+            b_sub_ref = 1;
          }
  
          /* sub mb type */
@@ -461,7 +422,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
          }
  
          for( i = 0; i < 4; i++ )
-            cavlc_mb8x8_mvd( h, s, 0, i );
+            cavlc_mb8x8_mvd( h, s, i );
      }
      else if( i_mb_type == B_8x8 )
      {
@@ -472,99 +433,63 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
              bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
  
          /* ref */
-        for( i = 0; i < 4; i++ )
-            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
-                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
-        for( i = 0; i < 4; i++ )
-            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
-                bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
+        if( h->mb.pic.i_fref[0] > 1 )
+            for( i = 0; i < 4; i++ )
+                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                    bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
+        if( h->mb.pic.i_fref[1] > 1 )
+            for( i = 0; i < 4; i++ )
+                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                    bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
  
          /* mvd */
          for( i = 0; i < 4; i++ )
-            cavlc_mb8x8_mvd( h, s, 0, i );
+            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                cavlc_mb_mvd( h, s, 0, 4*i, 2 );
          for( i = 0; i < 4; i++ )
-            cavlc_mb8x8_mvd( h, s, 1, i );
+            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                cavlc_mb_mvd( h, s, 1, 4*i, 2 );
      }
      else if( i_mb_type != B_DIRECT )
      {
          /* All B mode */
          /* Motion Vector */
-        int i_list;
-        DECLARE_ALIGNED_4( int16_t mvp[2] );
          const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
+        const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
+        const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
  
          bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
-
-        for( i_list = 0; i_list < 2; i_list++ )
+        if( h->mb.i_partition == D_16x16 )
          {
-            const int i_ref_max = (i_list == 0 ? h->mb.pic.i_fref[0] : h->mb.pic.i_fref[1]) - 1;
-
-            if( i_ref_max )
-                switch( h->mb.i_partition )
-                {
-                    case D_16x16:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        break;
-                    case D_16x8:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[8]] );
-                        break;
-                    case D_8x16:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max, h->mb.cache.ref[i_list][x264_scan8[4]] );
-                        break;
-                }
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
+            if( b_list[0][0] ) cavlc_mb_mvd( h, s, 0, 0, 4 );
+            if( b_list[1][0] ) cavlc_mb_mvd( h, s, 1, 0, 4 );
          }
-        for( i_list = 0; i_list < 2; i_list++ )
+        else
          {
-            switch( h->mb.i_partition )
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
+            if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
+            if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
+            if( h->mb.i_partition == D_16x8 )
              {
-                case D_16x16:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    break;
-                case D_16x8:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    if( b_list[i_list][1] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 8, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][1] - mvp[1] );
-                    }
-                    break;
-                case D_8x16:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 2, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    if( b_list[i_list][1] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 4, 2, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][1] - mvp[1] );
-                    }
-                    break;
+                if( b_list[0][0] ) cavlc_mb_mvd( h, s, 0, 0, 4 );
+                if( b_list[0][1] ) cavlc_mb_mvd( h, s, 0, 8, 4 );
+                if( b_list[1][0] ) cavlc_mb_mvd( h, s, 1, 0, 4 );
+                if( b_list[1][1] ) cavlc_mb_mvd( h, s, 1, 8, 4 );
+            }
+            else //if( h->mb.i_partition == D_8x16 )
+            {
+                if( b_list[0][0] ) cavlc_mb_mvd( h, s, 0, 0, 2 );
+                if( b_list[0][1] ) cavlc_mb_mvd( h, s, 0, 4, 2 );
+                if( b_list[1][0] ) cavlc_mb_mvd( h, s, 1, 0, 2 );
+                if( b_list[1][1] ) cavlc_mb_mvd( h, s, 1, 4, 2 );
              }
          }
      }
-    else if( i_mb_type == B_DIRECT )
-        bs_write_ue( s, 0 );
-    else
-    {
-        x264_log(h, X264_LOG_ERROR, "invalid/unhandled mb_type\n" );
-        return;
-    }
+    else //if( i_mb_type == B_DIRECT )
+        bs_write1( s, 1 );
  
  #if !RDO_SKIP_BS
      i_mb_pos_tex = bs_pos( s );
@@ -587,12 +512,12 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
          cavlc_qp_delta( h, s );
  
          /* DC Luma */
-        block_residual_write_cavlc( h, s, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc, 16 );
+        block_residual_write_cavlc( h, s, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
  
          /* AC Luma */
          if( h->mb.i_cbp_luma )
              for( i = 0; i < 16; i++ )
-                block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 );
+                block_residual_write_cavlc( h, s, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
      }
      else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
      {
@@ -602,11 +527,11 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
      if( h->mb.i_cbp_chroma )
      {
          /* Chroma DC residual present */
-        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
-        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
+        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
+        block_residual_write_cavlc( h, s, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
          if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
              for( i = 16; i < 24; i++ )
-                block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
+                block_residual_write_cavlc( h, s, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
      }
  
  #if !RDO_SKIP_BS
@@ -618,64 +543,61 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
  /*****************************************************************************
   * RD only; doesn't generate a valid bitstream
   * doesn't write cbp or chroma dc (I don't know how much this matters)
- * doesn't write ref or subpartition (never varies between calls, so no point in doing so)
+ * doesn't write ref (never varies between calls, so no point in doing so)
+ * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
   * works on all partition sizes except 16x16
- * for sub8x8, call once per 8x8 block
   *****************************************************************************/
  static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
  {
-    bs_t s;
      const int i_mb_type = h->mb.i_type;
      int b_8x16 = h->mb.i_partition == D_8x16;
      int j;
-
-    s.i_bits_encoded = 0;
+    h->out.bs.i_bits_encoded = 0;
  
      if( i_mb_type == P_8x8 )
-        cavlc_mb8x8_mvd( h, &s, 0, i8 );
-    else if( i_mb_type == P_L0 )
-        cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 );
-    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
      {
-        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, &s, 0, 4*i8, 4>>b_8x16 );
-        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, &s, 1, 4*i8, 4>>b_8x16 );
+        cavlc_mb8x8_mvd( h, &h->out.bs, i8 );
+        bs_write_ue( &h->out.bs, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
      }
-    else if( i_mb_type == B_8x8 )
+    else if( i_mb_type == P_L0 )
+        cavlc_mb_mvd( h, &h->out.bs, 0, 4*i8, 4>>b_8x16 );
+    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
      {
-        cavlc_mb8x8_mvd( h, &s, 0, i8 );
-        cavlc_mb8x8_mvd( h, &s, 1, i8 );
+        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, &h->out.bs, 0, 4*i8, 4>>b_8x16 );
+        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, &h->out.bs, 1, 4*i8, 4>>b_8x16 );
      }
-    else
+    else //if( i_mb_type == B_8x8 )
      {
-        x264_log(h, X264_LOG_ERROR, "invalid/unhandled mb_type\n" );
-        return 0;
+        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] )
+            cavlc_mb_mvd( h, &h->out.bs, 0, 4*i8, 2 );
+        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+            cavlc_mb_mvd( h, &h->out.bs, 1, 4*i8, 2 );
      }
  
      for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
      {
-        x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
-        block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
-        block_residual_write_cavlc( h, &s, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
+        x264_macroblock_luma_write_cavlc( h, &h->out.bs, i8, i8 );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1 );
          i8 += x264_pixel_size[i_pixel].h >> 3;
      }
  
-    return s.i_bits_encoded;
+    return h->out.bs.i_bits_encoded;
  }
  
  static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel )
  {
-    bs_t s;
      int b_8x4 = i_pixel == PIXEL_8x4;
-    s.i_bits_encoded = 0;
-    cavlc_mb_mvd( h, &s, 0, i4, 1+b_8x4 );
-    block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
+    h->out.bs.i_bits_encoded = 0;
+    cavlc_mb_mvd( h, &h->out.bs, 0, i4, 1+b_8x4 );
+    block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
      if( i_pixel != PIXEL_4x4 )
      {
          i4 += 2-b_8x4;
-        block_residual_write_cavlc( h, &s, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
      }
  
-    return s.i_bits_encoded;
+    return h->out.bs.i_bits_encoded;
  }
  
  static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
@@ -688,21 +610,16 @@ static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
  
  static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
  {
-    int i4;
      h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode );
-    h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
-    for( i4 = 0; i4 < 4; i4++ )
-    {
-        h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
-        block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
-    }
+    bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
+    x264_macroblock_luma_write_cavlc( h, &h->out.bs, i8, i8 );
      return h->out.bs.i_bits_encoded;
  }
  
  static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode )
  {
      h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode );
-    block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 );
+    block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
      return h->out.bs.i_bits_encoded;
  }
  
@@ -711,14 +628,14 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
      h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
      if( h->mb.i_cbp_chroma )
      {
-        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 );
-        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
+        block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
  
          if( h->mb.i_cbp_chroma == 2 )
          {
              int i;
              for( i = 16; i < 24; i++ )
-                block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 );
+                block_residual_write_cavlc( h, &h->out.bs, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
          }
      }
      return h->out.bs.i_bits_encoded;