git.sesse.net Git - x264/blobdiff - encoder/cavlc.c
Allow 8x8dct+cavlc+lossless with subme>=6
[x264] / encoder / cavlc.c
index a9715c2767dec696f6b3f156e0476212667d436c..c5798108ff866a2768002c5af05439c52f00dfe8 100644 (file)
@@ -1,10 +1,11 @@
 /*****************************************************************************
- * cavlc.c: h264 encoder library
+ * cavlc.c: cavlc bitstream writing
  *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
- * $Id: cavlc.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
+ * Copyright (C) 2003-2010 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
  *****************************************************************************/
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
 #include "common/common.h"
-#include "common/vlc.h"
 #include "macroblock.h"
 
+#ifndef RDO_SKIP_BS
+#define RDO_SKIP_BS 0
+#endif
+
 static const uint8_t intra4x4_cbp_to_golomb[48]=
 {
   3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11,  2,
@@ -56,644 +59,584 @@ static const uint8_t sub_mb_type_b_to_golomb[13]=
     10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0
 };
 
-static const uint8_t block_idx_x[16] =
-{
-    0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
-};
-static const uint8_t block_idx_y[16] =
-{
-    0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
-};
-static const uint8_t block_idx_xy[4][4] =
-{
-    { 0, 2, 8,  10},
-    { 1, 3, 9,  11},
-    { 4, 6, 12, 14},
-    { 5, 7, 13, 15}
-};
-
-#define BLOCK_INDEX_CHROMA_DC   (-1)
-#define BLOCK_INDEX_LUMA_DC     (-2)
-
-static inline void bs_write_vlc( bs_t *s, vlc_t v )
-{
-    bs_write( s, v.i_size, v.i_bits );
-}
+#define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits )
 
 /****************************************************************************
  * block_residual_write_cavlc:
  ****************************************************************************/
-static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, int i_count )
+static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_length, int level )
 {
-    int level[16], run[16];
-    int i_total, i_trailing;
-    int i_total_zero;
-    int i_last;
-    unsigned int i_sign;
-
-    int i;
-    int i_zero_left;
-    int i_suffix_length;
-
-    /* first find i_last */
-    i_last = i_count - 1;
-    while( i_last >= 0 && l[i_last] == 0 )
+    bs_t *s = &h->out.bs;
+    static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
+    int i_level_prefix = 15;
+    int mask = level >> 31;
+    int abs_level = (level^mask)-mask;
+    int i_level_code = abs_level*2-mask-2;
+    if( ( i_level_code >> i_suffix_length ) < 15 )
     {
-        i_last--;
+        bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
+                 (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
     }
-
-    i_sign = 0;
-    i_total = 0;
-    i_trailing = 0;
-    i_total_zero = 0;
-
-    if( i_last >= 0 )
+    else
     {
-        int b_trailing = 1;
-        int idx = 0;
+        i_level_code -= 15 << i_suffix_length;
+        if( i_suffix_length == 0 )
+            i_level_code -= 15;
 
-        /* level and run and total */
-        while( i_last >= 0 )
+        /* If the prefix size exceeds 15, High Profile is required. */
+        if( i_level_code >= 1<<12 )
         {
-            level[idx] = l[i_last--];
-
-            run[idx] = 0;
-            while( i_last >= 0 && l[i_last] == 0 )
+            if( h->sps->i_profile_idc >= PROFILE_HIGH )
             {
-                run[idx]++;
-                i_last--;
-            }
-
-            i_total++;
-            i_total_zero += run[idx];
-
-            if( b_trailing && abs( level[idx] ) == 1 && i_trailing < 3 )
-            {
-                i_sign <<= 1;
-                if( level[idx] < 0 )
+                while( i_level_code > 1<<(i_level_prefix-3) )
                 {
-                    i_sign |= 0x01;
+                    i_level_code -= 1<<(i_level_prefix-3);
+                    i_level_prefix++;
                 }
-
-                i_trailing++;
             }
             else
             {
-                b_trailing = 0;
+#if RDO_SKIP_BS
+                /* Weight highly against overflows. */
+                s->i_bits_encoded += 2000;
+#else
+                x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
+                /* clip level, preserving sign */
+                i_level_code = (1<<12) - 2 + (i_level_code & 1);
+#endif
             }
-
-            idx++;
         }
+        bs_write( s, i_level_prefix + 1, 1 );
+        bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
     }
+    if( i_suffix_length == 0 )
+        i_suffix_length++;
+    if( abs_level > next_suffix[i_suffix_length] )
+        i_suffix_length++;
+    return i_suffix_length;
+}
 
-    /* total/trailing */
-    if( i_idx == BLOCK_INDEX_CHROMA_DC )
-    {
-        bs_write_vlc( s, x264_coeff_token[4][i_total*4+i_trailing] );
-    }
-    else
-    {
-        /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
-        static const int ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3 };
-        int nC;
+static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dctcoef *l, int nC )
+{
+    bs_t *s = &h->out.bs;
+    static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
+    static const uint8_t count_cat[5] = {16, 15, 16, 4, 15};
+    x264_run_level_t runlevel;
+    int i_trailing, i_total_zero, i_suffix_length;
+    int i_total = 0;
+    unsigned int i_sign;
 
-        if( i_idx == BLOCK_INDEX_LUMA_DC )
-        {
-            nC = x264_mb_predict_non_zero_code( h, 0 );
-        }
-        else
-        {
-            nC = x264_mb_predict_non_zero_code( h, i_idx );
-        }
+    /* level and run and total */
+    /* set these to 2 to allow branchless i_trailing calculation */
+    runlevel.level[1] = 2;
+    runlevel.level[2] = 2;
+    i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
+    i_total_zero = runlevel.last + 1 - i_total;
+
+    i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
+               | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
+               | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
+    i_trailing = ctz_index[i_trailing];
+    i_sign = ((runlevel.level[2] >> 31) & 1)
+           | ((runlevel.level[1] >> 31) & 2)
+           | ((runlevel.level[0] >> 31) & 4);
+    i_sign >>= 3-i_trailing;
 
-        bs_write_vlc( s, x264_coeff_token[ct_index[nC]][i_total*4+i_trailing] );
-    }
+    /* total/trailing */
+    bs_write_vlc( s, x264_coeff_token[nC][i_total-1][i_trailing] );
 
-    if( i_total <= 0 )
-    {
-        return;
-    }
+    i_suffix_length = i_total > 10 && i_trailing < 3;
+    bs_write( s, i_trailing, i_sign );
 
-    i_suffix_length = i_total > 10 && i_trailing < 3 ? 1 : 0;
-    if( i_trailing > 0 )
+    if( i_trailing < i_total )
     {
-        bs_write( s, i_trailing, i_sign );
-    }
-    for( i = i_trailing; i < i_total; i++ )
-    {
-        int i_level_code;
+        int val = runlevel.level[i_trailing];
+        int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
+        val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
+        val += LEVEL_TABLE_SIZE/2;
 
-        /* calculate level code */
-        if( level[i] < 0 )
-        {
-            i_level_code = -2*level[i] - 1;
-        }
-        else /* if( level[i] > 0 ) */
-        {
-            i_level_code = 2 * level[i] - 2;
-        }
-        if( i == i_trailing && i_trailing < 3 )
-        {
-            i_level_code -=2; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
-        }
-
-        if( ( i_level_code >> i_suffix_length ) < 14 )
-        {
-            bs_write_vlc( s, x264_level_prefix[i_level_code >> i_suffix_length] );
-            if( i_suffix_length > 0 )
-            {
-                bs_write( s, i_suffix_length, i_level_code );
-            }
-        }
-        else if( i_suffix_length == 0 && i_level_code < 30 )
-        {
-            bs_write_vlc( s, x264_level_prefix[14] );
-            bs_write( s, 4, i_level_code - 14 );
-        }
-        else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
+        if( (unsigned)val_original < LEVEL_TABLE_SIZE )
         {
-            bs_write_vlc( s, x264_level_prefix[14] );
-            bs_write( s, i_suffix_length, i_level_code );
+            bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+            i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next;
         }
         else
+            i_suffix_length = block_residual_write_cavlc_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
+        for( int i = i_trailing+1; i < i_total; i++ )
         {
-            bs_write_vlc( s, x264_level_prefix[15] );
-            i_level_code -= 15 << i_suffix_length;
-            if( i_suffix_length == 0 )
+            val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
+            if( (unsigned)val < LEVEL_TABLE_SIZE )
             {
-                i_level_code -= 15;
+                bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
+                i_suffix_length = x264_level_token[i_suffix_length][val].i_next;
             }
-
-            if( i_level_code >= ( 1 << 12 ) || i_level_code < 0 )
-            {
-                x264_log(h, X264_LOG_ERROR, "OVERFLOW levelcode=%d\n", i_level_code );
-            }
-
-            bs_write( s, 12, i_level_code );    /* check overflow ?? */
-        }
-
-        if( i_suffix_length == 0 )
-        {
-            i_suffix_length++;
-        }
-        if( abs( level[i] ) > ( 3 << ( i_suffix_length - 1 ) ) && i_suffix_length < 6 )
-        {
-            i_suffix_length++;
+            else
+                i_suffix_length = block_residual_write_cavlc_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
         }
     }
 
-    if( i_total < i_count )
+    if( (uint8_t)i_total < count_cat[ctx_block_cat] )
     {
-        if( i_idx == BLOCK_INDEX_CHROMA_DC )
-        {
+        if( ctx_block_cat == DCT_CHROMA_DC )
             bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
-        }
         else
-        {
             bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
-        }
     }
 
-    for( i = 0, i_zero_left = i_total_zero; i < i_total - 1; i++ )
+    for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
     {
-        int i_zl;
+        int i_zl = X264_MIN( i_total_zero, 7 );
+        bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
+        i_total_zero -= runlevel.run[i];
+    }
 
-        if( i_zero_left <= 0 )
-        {
-            break;
-        }
+    return i_total;
+}
+
+static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
 
-        i_zl = X264_MIN( i_zero_left - 1, 6 );
+#define block_residual_write_cavlc(h,cat,idx,l)\
+{\
+    int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? 0 : idx )];\
+    uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
+    if( !*nnz )\
+        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
+    else\
+        *nnz = block_residual_write_cavlc_internal(h,cat,l,nC);\
+}
 
-        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
+static void cavlc_qp_delta( x264_t *h )
+{
+    bs_t *s = &h->out.bs;
+    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
 
-        i_zero_left -= run[i];
+    /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
+    if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
+        && !h->mb.cache.non_zero_count[x264_scan8[24]] )
+    {
+#if !RDO_SKIP_BS
+        h->mb.i_qp = h->mb.i_last_qp;
+#endif
+        i_dqp = 0;
+    }
+
+    if( i_dqp )
+    {
+        if( i_dqp < -(QP_MAX+1)/2 )
+            i_dqp += QP_MAX+1;
+        else if( i_dqp > QP_MAX/2 )
+            i_dqp -= QP_MAX+1;
     }
+    bs_write_se( s, i_dqp );
 }
 
-static void x264_sub_mb_mv_write_cavlc( x264_t *h, bs_t *s, int i_list )
+static void cavlc_mb_mvd( x264_t *h, int i_list, int idx, int width )
 {
-    int i;
-    for( i = 0; i < 4; i++ )
-    {
-        int mvp[2];
+    bs_t *s = &h->out.bs;
+    ALIGNED_4( int16_t mvp[2] );
+    x264_mb_predict_mv( h, i_list, idx, width, mvp );
+    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
+    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
+}
 
-        if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-        {
-            continue;
-        }
+static inline void cavlc_mb8x8_mvd( x264_t *h, int i )
+{
+    switch( h->mb.i_sub_partition[i] )
+    {
+        case D_L0_8x8:
+            cavlc_mb_mvd( h, 0, 4*i, 2 );
+            break;
+        case D_L0_8x4:
+            cavlc_mb_mvd( h, 0, 4*i+0, 2 );
+            cavlc_mb_mvd( h, 0, 4*i+2, 2 );
+            break;
+        case D_L0_4x8:
+            cavlc_mb_mvd( h, 0, 4*i+0, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+1, 1 );
+            break;
+        case D_L0_4x4:
+            cavlc_mb_mvd( h, 0, 4*i+0, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+1, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+2, 1 );
+            cavlc_mb_mvd( h, 0, 4*i+3, 1 );
+            break;
+    }
+}
 
-        switch( h->mb.i_sub_partition[i] )
-        {
-            case D_L0_8x8:
-            case D_L1_8x8:
-            case D_BI_8x8:
-                x264_mb_predict_mv( h, i_list, 4*i, 2, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-                break;
-            case D_L0_8x4:
-            case D_L1_8x4:
-            case D_BI_8x4:
-                x264_mb_predict_mv( h, i_list, 4*i+0, 2, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+2, 2, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][1] - mvp[1] );
-                break;
-            case D_L0_4x8:
-            case D_L1_4x8:
-            case D_BI_4x8:
-                x264_mb_predict_mv( h, i_list, 4*i+0, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+1, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][1] - mvp[1] );
-                break;
-            case D_L0_4x4:
-            case D_L1_4x4:
-            case D_BI_4x4:
-                x264_mb_predict_mv( h, i_list, 4*i+0, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+1, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+2, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+3, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+3]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+3]][1] - mvp[1] );
-                break;
-        }
+static inline void x264_macroblock_luma_write_cavlc( x264_t *h, int i8start, int i8end )
+{
+    if( h->mb.b_transform_8x8 )
+    {
+        /* shuffle 8x8 dct coeffs into 4x4 lists */
+        for( int i8 = i8start; i8 <= i8end; i8++ )
+            if( h->mb.i_cbp_luma & (1 << i8) )
+                h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
     }
+
+    for( int i8 = i8start; i8 <= i8end; i8++ )
+        if( h->mb.i_cbp_luma & (1 << i8) )
+            for( int i4 = 0; i4 < 4; i4++ )
+                block_residual_write_cavlc( h, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4] );
 }
 
 /*****************************************************************************
  * x264_macroblock_write:
  *****************************************************************************/
-void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
+void x264_macroblock_write_cavlc( x264_t *h )
 {
+    bs_t *s = &h->out.bs;
     const int i_mb_type = h->mb.i_type;
+    static const uint8_t i_offsets[3] = {5,23,0};
+    int i_mb_i_offset = i_offsets[h->sh.i_type];
+
+#if RDO_SKIP_BS
+    s->i_bits_encoded = 0;
+#else
     const int i_mb_pos_start = bs_pos( s );
     int       i_mb_pos_tex;
-    int i_mb_i_offset;
-    int i;
+#endif
 
-    switch( h->sh.i_type )
+    if( h->sh.b_mbaff
+        && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
     {
-        case SLICE_TYPE_I:
-            i_mb_i_offset = 0;
-            break;
-        case SLICE_TYPE_P:
-            i_mb_i_offset = 5;
-            break;
-        case SLICE_TYPE_B:
-            i_mb_i_offset = 23;
-            break;
-        default:
-            x264_log(h, X264_LOG_ERROR, "internal error or slice unsupported\n" );
-            return;
+        bs_write1( s, h->mb.b_interlaced );
     }
 
-    /* Write:
-      - type
-      - prediction
-      - mv */
+#if !RDO_SKIP_BS
     if( i_mb_type == I_PCM )
     {
-        /* Untested */
+        uint8_t *p_start = s->p_start;
         bs_write_ue( s, i_mb_i_offset + 25 );
+        i_mb_pos_tex = bs_pos( s );
+        h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
 
         bs_align_0( s );
-        /* Luma */
-        for( i = 0; i < 16*16; i++ )
-        {
-            const int x = 16 * h->mb.i_mb_x + (i % 16);
-            const int y = 16 * h->mb.i_mb_y + (i / 16);
-            bs_write( s, 8, h->fenc->plane[0][y*h->mb.pic.i_stride[0]+x] );
-        }
-        /* Cb */
-        for( i = 0; i < 8*8; i++ )
-        {
-            const int x = 8 * h->mb.i_mb_x + (i % 8);
-            const int y = 8 * h->mb.i_mb_y + (i / 8);
-            bs_write( s, 8, h->fenc->plane[1][y*h->mb.pic.i_stride[1]+x] );
-        }
-        /* Cr */
-        for( i = 0; i < 8*8; i++ )
-        {
-            const int x = 8 * h->mb.i_mb_x + (i % 8);
-            const int y = 8 * h->mb.i_mb_y + (i / 8);
-            bs_write( s, 8, h->fenc->plane[2][y*h->mb.pic.i_stride[2]+x] );
-        }
+
+        for( int i = 0; i < 256; i++ )
+            bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] );
+        for( int ch = 1; ch < 3; ch++ )
+            for( int i = 0; i < 8; i++ )
+                for( int j = 0; j < 8; j++ )
+                    bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
+
+        bs_init( s, s->p, s->p_end - s->p );
+        s->p_start = p_start;
+
+        h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
         return;
     }
-    else if( i_mb_type == I_4x4 )
+#endif
+
+    /* Write:
+      - type
+      - prediction
+      - mv */
+    if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
     {
+        int di = i_mb_type == I_8x8 ? 4 : 1;
         bs_write_ue( s, i_mb_i_offset + 0 );
+        if( h->pps->b_transform_8x8_mode )
+            bs_write1( s, h->mb.b_transform_8x8 );
 
         /* Prediction: Luma */
-        for( i = 0; i < 16; i++ )
+        for( int i = 0; i < 16; i += di )
         {
             int i_pred = x264_mb_predict_intra4x4_mode( h, i );
-            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+            int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] );
 
-            if( i_pred == i_mode)
-            {
+            if( i_pred == i_mode )
                 bs_write1( s, 1 );  /* b_prev_intra4x4_pred_mode */
-            }
             else
-            {
-                bs_write1( s, 0 );  /* b_prev_intra4x4_pred_mode */
-                if( i_mode < i_pred )
-                {
-                    bs_write( s, 3, i_mode );
-                }
-                else
-                {
-                    bs_write( s, 3, i_mode - 1 );
-                }
-            }
+                bs_write( s, 4, i_mode - (i_mode > i_pred) );
         }
-        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+        bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
     }
     else if( i_mb_type == I_16x16 )
     {
-        bs_write_ue( s, i_mb_i_offset + 1 + h->mb.i_intra16x16_pred_mode +
+        bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
                         h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
-        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+        bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
     }
     else if( i_mb_type == P_L0 )
     {
-        int mvp[2];
-
         if( h->mb.i_partition == D_16x16 )
         {
-            bs_write_ue( s, 0 );
+            bs_write1( s, 1 );
 
-            if( h->sh.i_num_ref_idx_l0_active > 1 )
-            {
-                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-            }
-            x264_mb_predict_mv( h, 0, 0, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
+            if( h->mb.pic.i_fref[0] > 1 )
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            cavlc_mb_mvd( h, 0, 0, 4 );
         }
         else if( h->mb.i_partition == D_16x8 )
         {
             bs_write_ue( s, 1 );
-            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            if( h->mb.pic.i_fref[0] > 1 )
             {
-                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
             }
-
-            x264_mb_predict_mv( h, 0, 0, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
-            x264_mb_predict_mv( h, 0, 8, 4, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[8]][1] - mvp[1] );
+            cavlc_mb_mvd( h, 0, 0, 4 );
+            cavlc_mb_mvd( h, 0, 8, 4 );
         }
         else if( h->mb.i_partition == D_8x16 )
         {
             bs_write_ue( s, 2 );
-            if( h->sh.i_num_ref_idx_l0_active > 1 )
+            if( h->mb.pic.i_fref[0] > 1 )
             {
-                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+                bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
             }
-
-            x264_mb_predict_mv( h, 0, 0, 2, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[0]][1] - mvp[1] );
-
-            x264_mb_predict_mv( h, 0, 4, 2, mvp );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][0] - mvp[0] );
-            bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4]][1] - mvp[1] );
+            cavlc_mb_mvd( h, 0, 0, 2 );
+            cavlc_mb_mvd( h, 0, 4, 2 );
         }
     }
     else if( i_mb_type == P_8x8 )
     {
-        int b_sub_ref0;
-
-        if( h->mb.cache.ref[0][x264_scan8[0]] == 0 && h->mb.cache.ref[0][x264_scan8[4]] == 0 &&
-            h->mb.cache.ref[0][x264_scan8[8]] == 0 && h->mb.cache.ref[0][x264_scan8[12]] == 0 )
+        int b_sub_ref;
+        if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] |
+             h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 )
         {
             bs_write_ue( s, 4 );
-            b_sub_ref0 = 0;
+            b_sub_ref = 0;
         }
         else
         {
             bs_write_ue( s, 3 );
-            b_sub_ref0 = 1;
+            b_sub_ref = 1;
         }
+
         /* sub mb type */
-        for( i = 0; i < 4; i++ )
-        {
-            bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
-        }
+        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
+            for( int i = 0; i < 4; i++ )
+                bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
+        else
+            bs_write( s, 4, 0xf );
+
         /* ref0 */
-        if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
+        if( b_sub_ref )
         {
-            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[0]] );
-            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4]] );
-            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
-            bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[12]] );
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] );
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] );
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] );
+            bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] );
         }
 
-        x264_sub_mb_mv_write_cavlc( h, s, 0 );
+        for( int i = 0; i < 4; i++ )
+            cavlc_mb8x8_mvd( h, i );
     }
     else if( i_mb_type == B_8x8 )
     {
         bs_write_ue( s, 22 );
 
         /* sub mb type */
-        for( i = 0; i < 4; i++ )
-        {
+        for( int i = 0; i < 4; i++ )
             bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
-        }
+
         /* ref */
-        for( i = 0; i < 4; i++ )
-        {
+        if( h->mb.pic.i_fref[0] > 1 )
+            for( int i = 0; i < 4; i++ )
+                if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
+                    bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
+        if( h->mb.pic.i_fref[1] > 1 )
+            for( int i = 0; i < 4; i++ )
+                if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+                    bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
+
+        /* mvd */
+        for( int i = 0; i < 4; i++ )
             if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
-            {
-                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
-            }
-        }
-        for( i = 0; i < 4; i++ )
-        {
+                cavlc_mb_mvd( h, 0, 4*i, 2 );
+        for( int i = 0; i < 4; i++ )
             if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
-            {
-                bs_write_te( s, h->sh.i_num_ref_idx_l1_active - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
-            }
-        }
-        /* mvd */
-        x264_sub_mb_mv_write_cavlc( h, s, 0 );
-        x264_sub_mb_mv_write_cavlc( h, s, 1 );
+                cavlc_mb_mvd( h, 1, 4*i, 2 );
     }
     else if( i_mb_type != B_DIRECT )
     {
         /* All B mode */
         /* Motion Vector */
-        int i_list;
-        int mvp[2];
-
-        int b_list[2][2];
+        const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type];
+        const int i_ref0_max = h->mb.pic.i_fref[0] - 1;
+        const int i_ref1_max = h->mb.pic.i_fref[1] - 1;
 
-        /* init ref list utilisations */
-        for( i = 0; i < 2; i++ )
+        bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
+        if( h->mb.i_partition == D_16x16 )
         {
-            b_list[0][i] = x264_mb_type_list0_table[i_mb_type][i];
-            b_list[1][i] = x264_mb_type_list1_table[i_mb_type][i];
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] );
+            if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
+            if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
         }
-
-
-        bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] );
-
-        for( i_list = 0; i_list < 2; i_list++ )
+        else
         {
-            const int i_ref_max = i_list == 0 ? h->sh.i_num_ref_idx_l0_active : h->sh.i_num_ref_idx_l1_active;
-
-            if( i_ref_max > 1 )
+            if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] );
+            if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] );
+            if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] );
+            if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] );
+            if( h->mb.i_partition == D_16x8 )
             {
-                switch( h->mb.i_partition )
-                {
-                    case D_16x16:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        break;
-                    case D_16x8:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[8]] );
-                        break;
-                    case D_8x16:
-                        if( b_list[i_list][0] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[0]] );
-                        if( b_list[i_list][1] ) bs_write_te( s, i_ref_max - 1, h->mb.cache.ref[i_list][x264_scan8[4]] );
-                        break;
-                }
+                if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 4 );
+                if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 8, 4 );
+                if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 4 );
+                if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 8, 4 );
             }
-        }
-        for( i_list = 0; i_list < 2; i_list++ )
-        {
-            switch( h->mb.i_partition )
+            else //if( h->mb.i_partition == D_8x16 )
             {
-                case D_16x16:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    break;
-                case D_16x8:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    if( b_list[i_list][1] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 8, 4, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[8]][1] - mvp[1] );
-                    }
-                    break;
-                case D_8x16:
-                    if( b_list[i_list][0] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 0, 2, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[0]][1] - mvp[1] );
-                    }
-                    if( b_list[i_list][1] )
-                    {
-                        x264_mb_predict_mv( h, i_list, 4, 2, mvp );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][0] - mvp[0] );
-                        bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4]][1] - mvp[1] );
-                    }
-                    break;
+                if( b_list[0][0] ) cavlc_mb_mvd( h, 0, 0, 2 );
+                if( b_list[0][1] ) cavlc_mb_mvd( h, 0, 4, 2 );
+                if( b_list[1][0] ) cavlc_mb_mvd( h, 1, 0, 2 );
+                if( b_list[1][1] ) cavlc_mb_mvd( h, 1, 4, 2 );
             }
         }
     }
-    else if( i_mb_type == B_DIRECT )
-    {
-        bs_write_ue( s, 0 );
-    }
-    else
-    {
-        x264_log(h, X264_LOG_ERROR, "invalid/unhandled mb_type\n" );
-        return;
-    }
+    else //if( i_mb_type == B_DIRECT )
+        bs_write1( s, 1 );
 
+#if !RDO_SKIP_BS
     i_mb_pos_tex = bs_pos( s );
-    h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+    h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
+#endif
 
     /* Coded block patern */
-    if( i_mb_type == I_4x4 )
-    {
+    if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
         bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
-    }
     else if( i_mb_type != I_16x16 )
-    {
         bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] );
-    }
+
+    /* transform size 8x8 flag */
+    if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma )
+        bs_write1( s, h->mb.b_transform_8x8 );
 
     /* write residual */
     if( i_mb_type == I_16x16 )
     {
-        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
+        cavlc_qp_delta( h );
 
         /* DC Luma */
-        block_residual_write_cavlc( h, s, BLOCK_INDEX_LUMA_DC , h->dct.luma16x16_dc, 16 );
+        block_residual_write_cavlc( h, DCT_LUMA_DC, 24 , h->dct.luma16x16_dc );
 
-        if( h->mb.i_cbp_luma != 0 )
-        {
-            /* AC Luma */
-            for( i = 0; i < 16; i++ )
-            {
-                block_residual_write_cavlc( h, s, i, h->dct.block[i].residual_ac, 15 );
-            }
-        }
+        /* AC Luma */
+        if( h->mb.i_cbp_luma )
+            for( int i = 0; i < 16; i++ )
+                block_residual_write_cavlc( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 );
     }
-    else if( h->mb.i_cbp_luma != 0 || h->mb.i_cbp_chroma != 0 )
+    else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma )
     {
-        bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp );
-
-        for( i = 0; i < 16; i++ )
-        {
-            if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
-            {
-                block_residual_write_cavlc( h, s, i, h->dct.block[i].luma4x4, 16 );
-            }
-        }
+        cavlc_qp_delta( h );
+        x264_macroblock_luma_write_cavlc( h, 0, 3 );
     }
-    if( h->mb.i_cbp_chroma != 0 )
+    if( h->mb.i_cbp_chroma )
     {
         /* Chroma DC residual present */
-        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[0], 4 );
-        block_residual_write_cavlc( h, s, BLOCK_INDEX_CHROMA_DC, h->dct.chroma_dc[1], 4 );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] );
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] );
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
-        {
-            for( i = 0; i < 8; i++ )
-            {
-                block_residual_write_cavlc( h, s, 16 + i, h->dct.block[16+i].residual_ac, 15 );
-            }
-        }
+            for( int i = 16; i < 24; i++ )
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
     }
 
-    if( IS_INTRA( i_mb_type ) )
-        h->stat.frame.i_itex_bits += bs_pos(s) - i_mb_pos_tex;
+#if !RDO_SKIP_BS
+    h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex;
+#endif
+}
+
+#if RDO_SKIP_BS
+/*****************************************************************************
+ * RD only; doesn't generate a valid bitstream
+ * doesn't write cbp or chroma dc (I don't know how much this matters)
+ * doesn't write ref (never varies between calls, so no point in doing so)
+ * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO
+ * works on all partition sizes except 16x16
+ *****************************************************************************/
+static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel ) /* RD-only size of one partition: writes its mvd(s) and residual through h->out.bs and returns h->out.bs.i_bits_encoded. i8 indexes the first 8x8 block, i_pixel is the PIXEL_* partition size. */
+{
+    bs_t *s = &h->out.bs;
+    const int i_mb_type = h->mb.i_type;
+    int b_8x16 = h->mb.i_partition == D_8x16; /* 1 for an 8x16 partition; turns the 4 passed to cavlc_mb_mvd below into 2 */
+    int j;
+
+    if( i_mb_type == P_8x8 )
+    {
+        cavlc_mb8x8_mvd( h, i8 );
+        bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] ); /* sub-partition type is written only in this P_8x8 branch */
+    }
+    else if( i_mb_type == P_L0 )
+        cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 ); /* list 0 only; 4*i8 converts the 8x8 index to a 4x4 block index */
+    else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 )
+    {
+        if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mb_mvd( h, 0, 4*i8, 4>>b_8x16 ); /* !!i8 picks table entry 0 for i8 == 0 and entry 1 otherwise */
+        if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mb_mvd( h, 1, 4*i8, 4>>b_8x16 ); /* same test for list 1 */
+    }
+    else //if( i_mb_type == B_8x8 )
+    {
+        if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] ) /* list usage is looked up from the sub-partition type of block i8 */
+            cavlc_mb_mvd( h, 0, 4*i8, 2 );
+        if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+            cavlc_mb_mvd( h, 1, 4*i8, 2 );
+    }
+
+    for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) /* two passes when i_pixel < PIXEL_8x8, otherwise one */
+    {
+        x264_macroblock_luma_write_cavlc( h, i8, i8 ); /* luma residual for the single 8x8 block i8 */
+        block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 ); /* chroma AC block 16+i8; the +1 skips the first coefficient of the stored block */
+        block_residual_write_cavlc( h, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1 ); /* chroma AC block 20+i8 */
+        i8 += x264_pixel_size[i_pixel].h >> 3; /* step to the 8x8 block used by the second pass */
+    }
+
+    return h->out.bs.i_bits_encoded; /* NOTE(review): unlike the other size functions in this section, i_bits_encoded is not reset here - presumably the caller zeroes it; confirm */
+}
+
+static int x264_subpartition_size_cavlc( x264_t *h, int i4, int i_pixel ) /* RD-only size of one sub-8x8 partition starting at 4x4 block i4: list 0 mvd plus one or two luma 4x4 residual blocks; returns h->out.bs.i_bits_encoded */
+{
+    int b_8x4 = i_pixel == PIXEL_8x4;
+    h->out.bs.i_bits_encoded = 0; /* start counting from zero for this call */
+    cavlc_mb_mvd( h, 0, i4, 1+b_8x4 ); /* last argument is 2 for 8x4 and 1 otherwise */
+    block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+    if( i_pixel != PIXEL_4x4 ) /* any size other than 4x4 covers a second 4x4 block */
+    {
+        i4 += 2-b_8x4; /* second block is i4+1 for 8x4 and i4+2 otherwise */
+        block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+    }
+
+    return h->out.bs.i_bits_encoded;
+}
+
+static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode ) /* size charged for signalling intra mode i_mode on block i4; callers in this section store the result in h->out.bs.i_bits_encoded */
+{
+    if( x264_mb_predict_intra4x4_mode( h, i4 ) == x264_mb_pred_mode4x4_fix( i_mode ) ) /* does the predicted mode equal the fixed-up requested mode? */
+        return 1; /* match: size 1 */
     else
-        h->stat.frame.i_ptex_bits += bs_pos(s) - i_mb_pos_tex;
+        return 4; /* mismatch: size 4 */
+}
+
+static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode ) /* RD-only size of intra 8x8 block i8 coded with mode i_mode: mode cost, cbp code and luma residual; returns h->out.bs.i_bits_encoded */
+{
+    h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode ); /* counter restarts at the mode cost; 4*i8 is the first 4x4 block of the 8x8 block */
+    bs_write_ue( &h->out.bs, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] ); /* cbp code built from the chroma and luma cbp of the whole macroblock */
+    x264_macroblock_luma_write_cavlc( h, i8, i8 ); /* luma residual for the single 8x8 block i8 */
+    return h->out.bs.i_bits_encoded;
+}
+
+static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode ) /* RD-only size of intra 4x4 block i4 coded with mode i_mode: mode cost plus luma residual; returns h->out.bs.i_bits_encoded */
+{
+    h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode ); /* counter restarts at the mode cost */
+    block_residual_write_cavlc( h, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4] );
+    return h->out.bs.i_bits_encoded;
+}
+
+static int x264_i8x8_chroma_size_cavlc( x264_t *h ) /* RD-only size of the chroma part of the current macroblock: chroma prediction mode code plus chroma DC/AC residual; returns h->out.bs.i_bits_encoded */
+{
+    h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); /* counter restarts at the ue size of the fixed-up chroma prediction mode */
+    if( h->mb.i_cbp_chroma ) /* residual is counted only when the chroma cbp is non-zero */
+    {
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0] ); /* DC of the first chroma plane, block index 25 */
+        block_residual_write_cavlc( h, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1] ); /* DC of the second chroma plane, block index 26 */
+
+        if( h->mb.i_cbp_chroma == 2 ) /* NOTE(review): the full macroblock writer earlier in this file tests i_cbp_chroma&0x02 instead - presumably equivalent for the values cbp_chroma takes; confirm */
+        {
+            for( int i = 16; i < 24; i++ ) /* the eight chroma AC blocks, indices 16..23 */
+                block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 ); /* the +1 skips the first coefficient of the stored block */
+        }
+    }
+    return h->out.bs.i_bits_encoded;
 }
+#endif