X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=encoder%2Fcabac.c;h=ebb66147e06c700a64ac9dd44ddb4da2b05642d6;hb=9f422c0cd9c0abcd6a7abb10b51f8be883c39b2b;hp=a2220c66f740cd3400d1256ed8c150a6b0a9c87d;hpb=f5af5f14e5d924a3b57d6bfbd1219a334771727b;p=x264 diff --git a/encoder/cabac.c b/encoder/cabac.c index a2220c66..ebb66147 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -1,7 +1,7 @@ /***************************************************************************** - * cabac.c: h264 encoder library + * cabac.c: cabac bitstream writing ***************************************************************************** - * Copyright (C) 2003-2008 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar * Loren Merritt @@ -20,6 +20,9 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" @@ -63,118 +66,21 @@ static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_ } } -static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb ) +#if !RDO_SKIP_BS +static void x264_cabac_field_decoding_flag( x264_t *h, x264_cabac_t *cb ) { - const int i_mb_type = h->mb.i_type; - - if( h->sh.b_mbaff && - (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) ) - { - x264_cabac_encode_decision_noup( cb, 70 + h->mb.cache.i_neighbour_interlaced, h->mb.b_interlaced ); - } - - if( h->sh.i_type == SLICE_TYPE_I ) - { - int ctx = 0; - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != I_4x4 ) - ctx++; - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != I_4x4 ) - ctx++; - - x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 ); - } - else if( h->sh.i_type == SLICE_TYPE_P ) - { - /* prefix: 14, suffix: 17 */ - if( i_mb_type == P_L0 ) - { - x264_cabac_encode_decision_noup( cb, 14, 0 ); - x264_cabac_encode_decision_noup( cb, 15, h->mb.i_partition != D_16x16 ); - x264_cabac_encode_decision_noup( cb, 17-(h->mb.i_partition == D_16x16), h->mb.i_partition == D_16x8 ); - } - else if( i_mb_type == P_8x8 ) - { - x264_cabac_encode_decision_noup( cb, 14, 0 ); - x264_cabac_encode_decision_noup( cb, 15, 0 ); - x264_cabac_encode_decision_noup( cb, 16, 1 ); - } - else /* intra */ - { - /* prefix */ - x264_cabac_encode_decision_noup( cb, 14, 1 ); - - /* suffix */ - x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 ); - } - } - else //if( h->sh.i_type == SLICE_TYPE_B ) - { - int ctx = 0; - if( h->mb.i_mb_type_left >= 0 && h->mb.i_mb_type_left != B_SKIP && h->mb.i_mb_type_left != B_DIRECT ) - ctx++; - if( h->mb.i_mb_type_top >= 0 && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT ) - ctx++; - - if( i_mb_type == B_DIRECT ) - { - x264_cabac_encode_decision_noup( cb, 
27+ctx, 0 ); - return; - } - x264_cabac_encode_decision_noup( cb, 27+ctx, 1 ); + int ctx = 0; + ctx += h->mb.field_decoding_flag & !!h->mb.i_mb_x; + ctx += (h->mb.i_mb_top_mbpair_xy >= 0 + && h->mb.slice_table[h->mb.i_mb_top_mbpair_xy] == h->sh.i_first_mb + && h->mb.field[h->mb.i_mb_top_mbpair_xy]); - if( i_mb_type == B_8x8 ) - { - x264_cabac_encode_decision_noup( cb, 27+3, 1 ); - x264_cabac_encode_decision_noup( cb, 27+4, 1 ); - x264_cabac_encode_decision( cb, 27+5, 1 ); - x264_cabac_encode_decision( cb, 27+5, 1 ); - x264_cabac_encode_decision_noup( cb, 27+5, 1 ); - } - else if( IS_INTRA( i_mb_type ) ) - { - /* prefix */ - x264_cabac_encode_decision_noup( cb, 27+3, 1 ); - x264_cabac_encode_decision_noup( cb, 27+4, 1 ); - x264_cabac_encode_decision( cb, 27+5, 1 ); - x264_cabac_encode_decision( cb, 27+5, 0 ); - x264_cabac_encode_decision( cb, 27+5, 1 ); - - /* suffix */ - x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 ); - } - else - { - static const uint8_t i_mb_bits[9*3] = - { - 0x31, 0x29, 0x4, /* L0 L0 */ - 0x35, 0x2d, 0, /* L0 L1 */ - 0x43, 0x63, 0, /* L0 BI */ - 0x3d, 0x2f, 0, /* L1 L0 */ - 0x39, 0x25, 0x6, /* L1 L1 */ - 0x53, 0x73, 0, /* L1 BI */ - 0x4b, 0x6b, 0, /* BI L0 */ - 0x5b, 0x7b, 0, /* BI L1 */ - 0x47, 0x67, 0x21 /* BI BI */ - }; - - const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8); - int bits = i_mb_bits[idx]; - - x264_cabac_encode_decision_noup( cb, 27+3, bits&1 ); - x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2; - if( bits != 1 ) - { - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; - x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; - if( bits != 1 ) - x264_cabac_encode_decision_noup( cb, 27+5, bits&1 ); - } - } - } + x264_cabac_encode_decision_noup( cb, 70 + ctx, MB_INTERLACED ); + h->mb.field_decoding_flag = MB_INTERLACED; } +#endif -static void x264_cabac_mb_intra4x4_pred_mode( 
x264_cabac_t *cb, int i_pred, int i_mode ) +static void x264_cabac_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode ) { if( i_pred == i_mode ) x264_cabac_encode_decision( cb, 68, 1 ); @@ -189,13 +95,13 @@ static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int } } -static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb ) +static void x264_cabac_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb ) { - const int i_mode = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ]; - int ctx = 0; + int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]; + int ctx = 0; /* No need to test for I4x4 or I_16x16 as cache_save handle that */ - if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_xy - 1] != 0 ) + if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 ) ctx++; if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 ) ctx++; @@ -209,7 +115,7 @@ static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb ) } } -static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb ) +static void x264_cabac_cbp_luma( x264_t *h, x264_cabac_t *cb ) { int cbp = h->mb.i_cbp_luma; int cbp_l = h->mb.cache.i_cbp_left; @@ -220,7 +126,7 @@ static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb ) x264_cabac_encode_decision_noup( cb, 76 - ((cbp >> 2) & 1) - ((cbp >> 0) & 2), (cbp >> 3) & 1 ); } -static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb ) +static void x264_cabac_cbp_chroma( x264_t *h, x264_cabac_t *cb ) { int cbp_a = h->mb.cache.i_cbp_left & 0x30; int cbp_b = h->mb.cache.i_cbp_top & 0x30; @@ -237,17 +143,19 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb ) ctx = 4; if( cbp_a == 0x20 ) ctx++; if( cbp_b == 0x20 ) ctx += 2; - x264_cabac_encode_decision_noup( cb, 77 + ctx, h->mb.i_cbp_chroma > 1 ); + x264_cabac_encode_decision_noup( cb, 77 + ctx, h->mb.i_cbp_chroma >> 1 
); } } -static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb ) +static void x264_cabac_qp_delta( x264_t *h, x264_cabac_t *cb ) { int i_dqp = h->mb.i_qp - h->mb.i_last_qp; int ctx; - /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */ - if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] ) + /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely + * flat background area. Don't do this if it would raise the quantizer, since that could + * cause unexpected deblocking artifacts. */ + if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; @@ -255,16 +163,19 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb ) i_dqp = 0; } - /* Since, per the above, empty-CBP I16x16 blocks never have delta quants, - * we don't have to check for them. */ - ctx = h->mb.i_last_dqp && h->mb.cbp[h->mb.i_mb_prev_xy]; + ctx = h->mb.i_last_dqp && (h->mb.type[h->mb.i_mb_prev_xy] == I_16x16 || (h->mb.cbp[h->mb.i_mb_prev_xy]&0x3f)); if( i_dqp != 0 ) { - int val = i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp - 1); - /* dqp is interpreted modulo 52 */ - if( val >= 51 && val != 52 ) - val = 103 - val; + /* Faster than (i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp-1)). + * If you so much as sneeze on these lines, gcc will compile this suboptimally. 
*/ + i_dqp *= 2; + int val = 1 - i_dqp; + if( val < 0 ) val = i_dqp; + val--; + /* dqp is interpreted modulo (QP_MAX_SPEC+1) */ + if( val >= QP_MAX_SPEC && val != QP_MAX_SPEC+1 ) + val = 2*QP_MAX_SPEC+1 - val; do { x264_cabac_encode_decision( cb, 60 + ctx, 1 ); @@ -277,14 +188,14 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb ) #if !RDO_SKIP_BS void x264_cabac_mb_skip( x264_t *h, int b_skip ) { - int ctx = (h->mb.i_mb_type_left >= 0 && !IS_SKIP( h->mb.i_mb_type_left )) - + (h->mb.i_mb_type_top >= 0 && !IS_SKIP( h->mb.i_mb_type_top )) - + (h->sh.i_type == SLICE_TYPE_P ? 11 : 24); + int ctx = h->mb.cache.i_neighbour_skip + 11; + if( h->sh.i_type != SLICE_TYPE_P ) + ctx += 13; x264_cabac_encode_decision( &h->cabac, ctx, b_skip ); } #endif -static inline void x264_cabac_mb_sub_p_partition( x264_cabac_t *cb, int i_sub ) +static inline void x264_cabac_subpartition_p( x264_cabac_t *cb, int i_sub ) { if( i_sub == D_L0_8x8 ) { @@ -301,7 +212,7 @@ static inline void x264_cabac_mb_sub_p_partition( x264_cabac_t *cb, int i_sub ) } } -static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub ) +static ALWAYS_INLINE void x264_cabac_subpartition_b( x264_cabac_t *cb, int i_sub ) { if( i_sub == D_DIRECT_8x8 ) { @@ -321,204 +232,429 @@ static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub ) x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 ); } -static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb ) +static ALWAYS_INLINE void x264_cabac_transform_size( x264_t *h, x264_cabac_t *cb ) { int ctx = 399 + h->mb.cache.i_neighbour_transform_size; x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 ); } -static void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx ) +static ALWAYS_INLINE void x264_cabac_ref_internal( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int bframe ) { const int i8 = x264_scan8[idx]; const int i_refa = h->mb.cache.ref[i_list][i8 - 1]; const 
int i_refb = h->mb.cache.ref[i_list][i8 - 8]; - int i_ref = h->mb.cache.ref[i_list][i8]; - int ctx = 0; + int ctx = 0; - if( i_refa > 0 && !h->mb.cache.skip[i8 - 1] ) + if( i_refa > 0 && (!bframe || !h->mb.cache.skip[i8 - 1]) ) ctx++; - if( i_refb > 0 && !h->mb.cache.skip[i8 - 8] ) + if( i_refb > 0 && (!bframe || !h->mb.cache.skip[i8 - 8]) ) ctx += 2; - while( i_ref > 0 ) + for( int i_ref = h->mb.cache.ref[i_list][i8]; i_ref > 0; i_ref-- ) { x264_cabac_encode_decision( cb, 54 + ctx, 1 ); ctx = (ctx>>2)+4; - i_ref--; } x264_cabac_encode_decision( cb, 54 + ctx, 0 ); } -static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx ) +static NOINLINE void x264_cabac_ref_p( x264_t *h, x264_cabac_t *cb, int idx ) { - const int i_abs = abs( mvd ); - const int ctxbase = l ? 47 : 40; - int i; -#if RDO_SKIP_BS - if( i_abs == 0 ) + x264_cabac_ref_internal( h, cb, 0, idx, 0 ); +} +static NOINLINE void x264_cabac_ref_b( x264_t *h, x264_cabac_t *cb, int i_list, int idx ) +{ + x264_cabac_ref_internal( h, cb, i_list, idx, 1 ); +} + +static ALWAYS_INLINE int x264_cabac_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx ) +{ + int ctxbase = l ? 
47 : 40; + + if( mvd == 0 ) + { x264_cabac_encode_decision( cb, ctxbase + ctx, 0 ); + return 0; + } + + int i_abs = abs( mvd ); + x264_cabac_encode_decision( cb, ctxbase + ctx, 1 ); +#if RDO_SKIP_BS + if( i_abs <= 3 ) + { + for( int i = 1; i < i_abs; i++ ) + x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 ); + x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 ); + x264_cabac_encode_bypass( cb, mvd >> 31 ); + } else { - x264_cabac_encode_decision( cb, ctxbase + ctx, 1 ); - if( i_abs <= 3 ) + x264_cabac_encode_decision( cb, ctxbase + 3, 1 ); + x264_cabac_encode_decision( cb, ctxbase + 4, 1 ); + x264_cabac_encode_decision( cb, ctxbase + 5, 1 ); + if( i_abs < 9 ) { - for( i = 1; i < i_abs; i++ ) - x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 ); - x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 ); - x264_cabac_encode_bypass( cb, mvd < 0 ); + cb->f8_bits_encoded += x264_cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]]; + cb->state[ctxbase+6] = x264_cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]]; } else { - x264_cabac_encode_decision( cb, ctxbase + 3, 1 ); - x264_cabac_encode_decision( cb, ctxbase + 4, 1 ); - x264_cabac_encode_decision( cb, ctxbase + 5, 1 ); - if( i_abs < 9 ) - { - cb->f8_bits_encoded += cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]]; - cb->state[ctxbase+6] = cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]]; - } - else - { - cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]]; - cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]]; - x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 ); - } + cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]]; + cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]]; + x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 ); } } #else static const uint8_t ctxes[8] = { 3,4,5,6,6,6,6,6 }; - if( i_abs == 0 ) - x264_cabac_encode_decision( cb, ctxbase + ctx, 0 ); + if( i_abs < 9 ) + { + for( int i = 1; i < i_abs; i++ ) + x264_cabac_encode_decision( 
cb, ctxbase + ctxes[i-1], 1 ); + x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs-1], 0 ); + } else { - x264_cabac_encode_decision( cb, ctxbase + ctx, 1 ); - if( i_abs < 9 ) - { - for( i = 1; i < i_abs; i++ ) - x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 ); - x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs-1], 0 ); - } - else - { - for( i = 1; i < 9; i++ ) - x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 ); - x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 ); - } - x264_cabac_encode_bypass( cb, mvd < 0 ); + for( int i = 1; i < 9; i++ ) + x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 ); + x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 ); } + x264_cabac_encode_bypass( cb, mvd >> 31 ); #endif + /* Since we don't need to keep track of MVDs larger than 66, just cap the value. + * This lets us store MVDs as 8-bit values instead of 16-bit. */ + return X264_MIN( i_abs, 66 ); } -static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width ) +static NOINLINE uint16_t x264_cabac_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width ) { ALIGNED_4( int16_t mvp[2] ); - uint32_t amvd; int mdx, mdy; /* Calculate mvd */ x264_mb_predict_mv( h, i_list, idx, width, mvp ); mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0]; mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1]; - amvd = x264_cabac_amvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1], - h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]); + uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1], + h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]); /* encode */ - x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFFFF ); - x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>16 ); + mdx = x264_cabac_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF ); + mdy = x264_cabac_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 ); - return pack16to32_mask(mdx,mdy); + return pack8to16(mdx,mdy); } -#define 
x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\ +#define x264_cabac_mvd(h,cb,i_list,idx,width,height)\ do\ {\ - uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width);\ + uint16_t mvd = x264_cabac_mvd(h,cb,i_list,idx,width);\ x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\ } while(0) -static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i ) +static inline void x264_cabac_8x8_mvd( x264_t *h, x264_cabac_t *cb, int i ) { switch( h->mb.i_sub_partition[i] ) { case D_L0_8x8: - x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 ); + x264_cabac_mvd( h, cb, 0, 4*i, 2, 2 ); break; case D_L0_8x4: - x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 2, 1 ); - x264_cabac_mb_mvd( h, cb, 0, 4*i+2, 2, 1 ); + x264_cabac_mvd( h, cb, 0, 4*i+0, 2, 1 ); + x264_cabac_mvd( h, cb, 0, 4*i+2, 2, 1 ); break; case D_L0_4x8: - x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 1, 2 ); - x264_cabac_mb_mvd( h, cb, 0, 4*i+1, 1, 2 ); + x264_cabac_mvd( h, cb, 0, 4*i+0, 1, 2 ); + x264_cabac_mvd( h, cb, 0, 4*i+1, 1, 2 ); break; case D_L0_4x4: - x264_cabac_mb_mvd( h, cb, 0, 4*i+0, 1, 1 ); - x264_cabac_mb_mvd( h, cb, 0, 4*i+1, 1, 1 ); - x264_cabac_mb_mvd( h, cb, 0, 4*i+2, 1, 1 ); - x264_cabac_mb_mvd( h, cb, 0, 4*i+3, 1, 1 ); + x264_cabac_mvd( h, cb, 0, 4*i+0, 1, 1 ); + x264_cabac_mvd( h, cb, 0, 4*i+1, 1, 1 ); + x264_cabac_mvd( h, cb, 0, 4*i+2, 1, 1 ); + x264_cabac_mvd( h, cb, 0, 4*i+3, 1, 1 ); break; default: assert(0); } } -/* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0 - * 1-> AC 16x16 i_idx = luma4x4idx - * 2-> Luma4x4 i_idx = luma4x4idx - * 3-> DC Chroma i_idx = iCbCr - * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx - * 5-> Luma8x8 i_idx = luma8x8idx - */ +static ALWAYS_INLINE void x264_cabac_mb_header_i( x264_t *h, x264_cabac_t *cb, int i_mb_type, int slice_type, int chroma ) +{ + if( slice_type == SLICE_TYPE_I ) + { + int ctx = 0; + if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != I_4x4 ) + ctx++; + if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top 
!= I_4x4 ) + ctx++; + + x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 ); + } + else if( slice_type == SLICE_TYPE_P ) + { + /* prefix */ + x264_cabac_encode_decision_noup( cb, 14, 1 ); + + /* suffix */ + x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 ); + } + else if( slice_type == SLICE_TYPE_B ) + { + /* prefix */ + x264_cabac_encode_decision_noup( cb, 27+3, 1 ); + x264_cabac_encode_decision_noup( cb, 27+4, 1 ); + x264_cabac_encode_decision( cb, 27+5, 1 ); + x264_cabac_encode_decision( cb, 27+5, 0 ); + x264_cabac_encode_decision( cb, 27+5, 1 ); + + /* suffix */ + x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 ); + } + + if( i_mb_type == I_PCM ) + return; + + if( i_mb_type != I_16x16 ) + { + if( h->pps->b_transform_8x8_mode ) + x264_cabac_transform_size( h, cb ); -static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra ) + int di = h->mb.b_transform_8x8 ? 
4 : 1; + for( int i = 0; i < 16; i += di ) + { + const int i_pred = x264_mb_predict_intra4x4_mode( h, i ); + const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] ); + x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode ); + } + } + + if( chroma ) + x264_cabac_intra_chroma_pred_mode( h, cb ); +} + +static ALWAYS_INLINE void x264_cabac_mb_header_p( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma ) { - int i_nza; - int i_nzb; + if( i_mb_type == P_L0 ) + { + x264_cabac_encode_decision_noup( cb, 14, 0 ); + if( h->mb.i_partition == D_16x16 ) + { + x264_cabac_encode_decision_noup( cb, 15, 0 ); + x264_cabac_encode_decision_noup( cb, 16, 0 ); + if( h->mb.pic.i_fref[0] > 1 ) + x264_cabac_ref_p( h, cb, 0 ); + x264_cabac_mvd( h, cb, 0, 0, 4, 4 ); + } + else if( h->mb.i_partition == D_16x8 ) + { + x264_cabac_encode_decision_noup( cb, 15, 1 ); + x264_cabac_encode_decision_noup( cb, 17, 1 ); + if( h->mb.pic.i_fref[0] > 1 ) + { + x264_cabac_ref_p( h, cb, 0 ); + x264_cabac_ref_p( h, cb, 8 ); + } + x264_cabac_mvd( h, cb, 0, 0, 4, 2 ); + x264_cabac_mvd( h, cb, 0, 8, 4, 2 ); + } + else //if( h->mb.i_partition == D_8x16 ) + { + x264_cabac_encode_decision_noup( cb, 15, 1 ); + x264_cabac_encode_decision_noup( cb, 17, 0 ); + if( h->mb.pic.i_fref[0] > 1 ) + { + x264_cabac_ref_p( h, cb, 0 ); + x264_cabac_ref_p( h, cb, 4 ); + } + x264_cabac_mvd( h, cb, 0, 0, 2, 4 ); + x264_cabac_mvd( h, cb, 0, 4, 2, 4 ); + } + } + else if( i_mb_type == P_8x8 ) + { + x264_cabac_encode_decision_noup( cb, 14, 0 ); + x264_cabac_encode_decision_noup( cb, 15, 0 ); + x264_cabac_encode_decision_noup( cb, 16, 1 ); - switch( i_cat ) + /* sub mb type */ + for( int i = 0; i < 4; i++ ) + x264_cabac_subpartition_p( cb, h->mb.i_sub_partition[i] ); + + /* ref 0 */ + if( h->mb.pic.i_fref[0] > 1 ) + { + x264_cabac_ref_p( h, cb, 0 ); + x264_cabac_ref_p( h, cb, 4 ); + x264_cabac_ref_p( h, cb, 8 ); + x264_cabac_ref_p( h, cb, 12 ); + } + + for( int i = 0; i < 4; i++ ) + 
x264_cabac_8x8_mvd( h, cb, i ); + } + else /* intra */ + x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_P, chroma ); +} + +static ALWAYS_INLINE void x264_cabac_mb_header_b( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma ) +{ + int ctx = 0; + if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != B_SKIP && h->mb.i_mb_type_left[0] != B_DIRECT ) + ctx++; + if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT ) + ctx++; + + if( i_mb_type == B_DIRECT ) { - case DCT_LUMA_AC: - case DCT_LUMA_4x4: - case DCT_CHROMA_AC: - /* no need to test for skip/pcm */ - i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1]; - i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8]; - if( x264_constant_p(b_intra) && !b_intra ) - return 85 + 4*i_cat + ((2*i_nzb + i_nza)&0x7f); - else + x264_cabac_encode_decision_noup( cb, 27+ctx, 0 ); + return; + } + x264_cabac_encode_decision_noup( cb, 27+ctx, 1 ); + + if( i_mb_type == B_8x8 ) + { + x264_cabac_encode_decision_noup( cb, 27+3, 1 ); + x264_cabac_encode_decision_noup( cb, 27+4, 1 ); + x264_cabac_encode_decision( cb, 27+5, 1 ); + x264_cabac_encode_decision( cb, 27+5, 1 ); + x264_cabac_encode_decision_noup( cb, 27+5, 1 ); + + /* sub mb type */ + for( int i = 0; i < 4; i++ ) + x264_cabac_subpartition_b( cb, h->mb.i_sub_partition[i] ); + + /* ref */ + if( h->mb.pic.i_fref[0] > 1 ) + for( int i = 0; i < 4; i++ ) + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] ) + x264_cabac_ref_b( h, cb, 0, 4*i ); + + if( h->mb.pic.i_fref[1] > 1 ) + for( int i = 0; i < 4; i++ ) + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) + x264_cabac_ref_b( h, cb, 1, 4*i ); + + for( int i = 0; i < 4; i++ ) + if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] ) + x264_cabac_mvd( h, cb, 0, 4*i, 2, 2 ); + + for( int i = 0; i < 4; i++ ) + if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) + x264_cabac_mvd( h, cb, 1, 4*i, 2, 2 ); + } + 
else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI ) + { + /* All B modes */ + static const uint8_t i_mb_bits[9*3] = + { + 0x31, 0x29, 0x4, /* L0 L0 */ + 0x35, 0x2d, 0, /* L0 L1 */ + 0x43, 0x63, 0, /* L0 BI */ + 0x3d, 0x2f, 0, /* L1 L0 */ + 0x39, 0x25, 0x6, /* L1 L1 */ + 0x53, 0x73, 0, /* L1 BI */ + 0x4b, 0x6b, 0, /* BI L0 */ + 0x5b, 0x7b, 0, /* BI L1 */ + 0x47, 0x67, 0x21 /* BI BI */ + }; + + const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8); + int bits = i_mb_bits[idx]; + + x264_cabac_encode_decision_noup( cb, 27+3, bits&1 ); + x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2; + if( bits != 1 ) + { + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; + x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; + if( bits != 1 ) + x264_cabac_encode_decision_noup( cb, 27+5, bits&1 ); + } + + const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type]; + if( h->mb.pic.i_fref[0] > 1 ) + { + if( b_list[0][0] ) + x264_cabac_ref_b( h, cb, 0, 0 ); + if( b_list[0][1] && h->mb.i_partition != D_16x16 ) + x264_cabac_ref_b( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) ); + } + if( h->mb.pic.i_fref[1] > 1 ) + { + if( b_list[1][0] ) + x264_cabac_ref_b( h, cb, 1, 0 ); + if( b_list[1][1] && h->mb.i_partition != D_16x16 ) + x264_cabac_ref_b( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) ); + } + for( int i_list = 0; i_list < 2; i_list++ ) + { + if( h->mb.i_partition == D_16x16 ) { - i_nza &= 0x7f + (b_intra << 7); - i_nzb &= 0x7f + (b_intra << 7); - return 85 + 4*i_cat + 2*!!i_nzb + !!i_nza; + if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 4, 4 ); } - case DCT_LUMA_DC: - i_nza = (h->mb.cache.i_cbp_left >> 8) & 1; - i_nzb = (h->mb.cache.i_cbp_top >> 8) & 1; - return 85 + 4*i_cat + 2*i_nzb + i_nza; - case DCT_CHROMA_DC: - /* no need to test skip/pcm */ - i_idx -= 25; - i_nza = h->mb.cache.i_cbp_left != -1 ? 
(h->mb.cache.i_cbp_left >> (9 + i_idx)) & 1 : b_intra; - i_nzb = h->mb.cache.i_cbp_top != -1 ? (h->mb.cache.i_cbp_top >> (9 + i_idx)) & 1 : b_intra; - return 85 + 4*i_cat + 2*i_nzb + i_nza; - default: - return 0; + else if( h->mb.i_partition == D_16x8 ) + { + if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 4, 2 ); + if( b_list[i_list][1] ) x264_cabac_mvd( h, cb, i_list, 8, 4, 2 ); + } + else //if( h->mb.i_partition == D_8x16 ) + { + if( b_list[i_list][0] ) x264_cabac_mvd( h, cb, i_list, 0, 2, 4 ); + if( b_list[i_list][1] ) x264_cabac_mvd( h, cb, i_list, 4, 2, 4 ); + } + } } + else /* intra */ + x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_B, chroma ); } +static int ALWAYS_INLINE x264_cabac_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra, int b_dc ) +{ + static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020}; -static const uint16_t significant_coeff_flag_offset[2][6] = { - { 105, 120, 134, 149, 152, 402 }, - { 277, 292, 306, 321, 324, 436 } -}; -static const uint16_t last_coeff_flag_offset[2][6] = { - { 166, 181, 195, 210, 213, 417 }, - { 338, 353, 367, 382, 385, 451 } -}; -static const uint16_t coeff_abs_level_m1_offset[6] = - { 227, 237, 247, 257, 266, 426 }; -static const uint8_t significant_coeff_flag_offset_8x8[2][63] = + if( b_dc ) + { + i_idx -= LUMA_DC; + if( i_cat == DCT_CHROMA_DC ) + { + int i_nza = h->mb.cache.i_cbp_left != -1 ? (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1 : b_intra; + int i_nzb = h->mb.cache.i_cbp_top != -1 ? 
(h->mb.cache.i_cbp_top >> (8 + i_idx)) & 1 : b_intra; + return base_ctx[i_cat] + 2*i_nzb + i_nza; + } + else + { + int i_nza = (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1; + int i_nzb = (h->mb.cache.i_cbp_top >> (8 + i_idx)) & 1; + return base_ctx[i_cat] + 2*i_nzb + i_nza; + } + } + else + { + int i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1]; + int i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8]; + if( x264_constant_p(b_intra) && !b_intra ) + return base_ctx[i_cat] + ((2*i_nzb + i_nza)&0x7f); + else + { + i_nza &= 0x7f + (b_intra << 7); + i_nzb &= 0x7f + (b_intra << 7); + return base_ctx[i_cat] + 2*!!i_nzb + !!i_nza; + } + } +} + +#if !RDO_SKIP_BS +extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][64]; +extern const uint8_t x264_last_coeff_flag_offset_8x8[63]; +extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7]; +extern const uint16_t x264_significant_coeff_flag_offset[2][16]; +extern const uint16_t x264_last_coeff_flag_offset[2][16]; +extern const uint16_t x264_coeff_abs_level_m1_offset[16]; +extern const uint8_t x264_count_cat_m1[14]; +#else +/* Padded to [64] for easier addressing */ +const uint8_t x264_significant_coeff_flag_offset_8x8[2][64] = {{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, @@ -530,102 +666,121 @@ static const uint8_t significant_coeff_flag_offset_8x8[2][63] = 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }}; -static const uint8_t last_coeff_flag_offset_8x8[63] = { +const uint8_t x264_last_coeff_flag_offset_8x8[63] = +{ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; +const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ +const uint16_t x264_significant_coeff_flag_offset[2][16] = +{ + { 105+0, 105+15, 105+29, 105+44, 
105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718, 0, 0 }, + { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733, 0, 0 } +}; +const uint16_t x264_last_coeff_flag_offset[2][16] = +{ + { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748, 0, 0 }, + { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757, 0, 0 } +}; +const uint16_t x264_coeff_abs_level_m1_offset[16] = +{ + 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 +}; +const uint8_t x264_count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; +#endif // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). // 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). /* map node ctx => cabac ctx for level=1 */ -static const int coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; /* map node ctx => cabac ctx for level>1 */ -static const int coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; +/* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that + * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. 
*/ +static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; + static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, /* update node ctx after coding a level>1 */ { 4, 4, 4, 4, 5, 6, 7, 7 } }; -static const int count_cat_m1[5] = {15, 14, 15, 3, 14}; #if !RDO_SKIP_BS -static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int16_t *l ) +static ALWAYS_INLINE void x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { - const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; - const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; - const int i_ctx_level = coeff_abs_level_m1_offset[i_ctxBlockCat]; - const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced]; - int i_coeff_abs_m1[64]; - int i_coeff_sign[64]; - int i_coeff = 0; - int i_last; - int node_ctx = 0; - int i = 0; - - i_last = h->quantf.coeff_last[i_ctxBlockCat](l); - -#define WRITE_SIGMAP( l8x8 )\ - while(1)\ + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; + int coeff_idx = -1, node_ctx = 0; + int last = h->quantf.coeff_last[ctx_block_cat]( l ); + const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; + dctcoef coeffs[64]; + +#define WRITE_SIGMAP( sig_off, last_off )\ +{\ + int i = 0;\ + while( 1 )\ {\ if( l[i] )\ {\ - i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;\ - i_coeff_sign[i_coeff] = l[i] < 0;\ - i_coeff++;\ - x264_cabac_encode_decision( cb, i_ctx_sig + (l8x8 ? 
significant_coeff_flag_offset[i] : i), 1 );\ - if( i == i_last )\ + coeffs[++coeff_idx] = l[i];\ + x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\ + if( i == last )\ {\ - x264_cabac_encode_decision( cb, i_ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 1 );\ + x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\ break;\ }\ else\ - x264_cabac_encode_decision( cb, i_ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );\ + x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\ }\ else\ - x264_cabac_encode_decision( cb, i_ctx_sig + (l8x8 ? significant_coeff_flag_offset[i] : i), 0 );\ - i++;\ - if( i == i_count_m1 )\ + x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\ + if( ++i == count_m1 )\ {\ - i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;\ - i_coeff_sign[i_coeff] = l[i] < 0;\ - i_coeff++;\ + coeffs[++coeff_idx] = l[i];\ break;\ }\ - } + }\ +} - if( i_ctxBlockCat == DCT_LUMA_8x8 ) + if( chroma422dc ) { - const int i_count_m1 = 63; - WRITE_SIGMAP( 1 ) + int count_m1 = 7; + WRITE_SIGMAP( x264_coeff_flag_offset_chroma_422_dc[i], x264_coeff_flag_offset_chroma_422_dc[i] ) } else { - const int i_count_m1 = count_cat_m1[i_ctxBlockCat]; - WRITE_SIGMAP( 0 ) + int count_m1 = x264_count_cat_m1[ctx_block_cat]; + if( count_m1 == 63 ) + { + const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; + WRITE_SIGMAP( sig_offset[i], x264_last_coeff_flag_offset_8x8[i] ) + } + else + WRITE_SIGMAP( i, i ) } do { - int i_prefix, ctx; - i_coeff--; - /* write coeff_abs - 1 */ - i_prefix = X264_MIN( i_coeff_abs_m1[i_coeff], 14 ); - ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level; + int coeff = coeffs[coeff_idx]; + int abs_coeff = abs(coeff); + int coeff_sign = coeff >> 31; + int ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level; - if( i_prefix ) + if( abs_coeff > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level; - for( i = 0; i < i_prefix - 1; i++ ) + ctx = levelgt1_ctx[node_ctx] 
+ ctx_level; + for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- ) x264_cabac_encode_decision( cb, ctx, 1 ); - if( i_prefix < 14 ) + if( abs_coeff < 15 ) x264_cabac_encode_decision( cb, ctx, 0 ); else - x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 ); + x264_cabac_encode_ue_bypass( cb, 0, abs_coeff - 15 ); node_ctx = coeff_abs_level_transition[1][node_ctx]; } @@ -635,50 +790,69 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl node_ctx = coeff_abs_level_transition[0][node_ctx]; } - x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] ); - } while( i_coeff > 0 ); + x264_cabac_encode_bypass( cb, coeff_sign ); + } while( --coeff_idx >= 0 ); } -#define block_residual_write_cabac_8x8( h, cb, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, l ) -#else - -/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct - * this is slightly incorrect because the sigmap is not reversible - * (contexts are repeated). However, there is nearly no quality penalty - * for this (~0.001db) and the speed boost (~30%) is worth it. 
*/ -static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int16_t *l, int b_8x8 ) +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; - const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; - const int i_ctx_level = coeff_abs_level_m1_offset[i_ctxBlockCat]; - const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced]; - int i_last, i_coeff_abs, ctx, i, node_ctx; - - i_last = h->quantf.coeff_last[i_ctxBlockCat](l); + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); +} - i_coeff_abs = abs(l[i_last]); - ctx = coeff_abs_level1_ctx[0] + i_ctx_level; +static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); +#endif +} +static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + /* Template a version specifically for chroma 4:2:2 DC in order to avoid + * slowing down everything else due to the added complexity. */ + x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 1 ); +} +#define x264_cabac_block_residual_8x8( h, cb, cat, l ) x264_cabac_block_residual( h, cb, cat, l ) +#else - if( i_last != (b_8x8 ? 63 : count_cat_m1[i_ctxBlockCat]) ) +/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and chroma 4:2:2 dc this is + * slightly incorrect because the sigmap is not reversible (contexts are repeated). However, there + * is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. 
*/ +static void ALWAYS_INLINE x264_cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) +{ + const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; + int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; + int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; + int last = h->quantf.coeff_last[ctx_block_cat]( l ); + int coeff_abs = abs(l[last]); + int ctx = coeff_abs_level1_ctx[0] + ctx_level; + int node_ctx; + const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; + + if( last != (b_8x8 ? 63 : chroma422dc ? 7 : x264_count_cat_m1[ctx_block_cat]) ) { - x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i_last]:i_last), 1 ); - x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i_last]:i_last), 1 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : + chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[last] : + chroma422dc ? 
x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); } - if( i_coeff_abs > 1 ) + if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[0] + i_ctx_level; - if( i_coeff_abs < 15 ) + ctx = levelgt1_ctx[0] + ctx_level; + if( coeff_abs < 15 ) { - cb->f8_bits_encoded += cabac_size_unary[i_coeff_abs-1][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[i_coeff_abs-1][cb->state[ctx]]; + cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; } else { - cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]]; - x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs - 15 ); + cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]]; + x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 ); } node_ctx = coeff_abs_level_transition[1][0]; } @@ -689,29 +863,31 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c x264_cabac_encode_bypass( cb, 0 ); // sign } - for( i = i_last-1 ; i >= 0; i-- ) + for( int i = last-1 ; i >= 0; i-- ) { if( l[i] ) { - i_coeff_abs = abs(l[i]); - x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 1 ); - x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i]:i), 0 ); - ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level; - - if( i_coeff_abs > 1 ) + coeff_abs = abs(l[i]); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : + chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[i] : + chroma422dc ? 
x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 ); + ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level; + + if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level; - if( i_coeff_abs < 15 ) + ctx = levelgt1_ctx[node_ctx] + ctx_level; + if( coeff_abs < 15 ) { - cb->f8_bits_encoded += cabac_size_unary[i_coeff_abs-1][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[i_coeff_abs-1][cb->state[ctx]]; + cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; } else { - cb->f8_bits_encoded += cabac_size_unary[14][cb->state[ctx]]; - cb->state[ctx] = cabac_transition_unary[14][cb->state[ctx]]; - x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs - 15 ); + cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]]; + cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]]; + x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 ); } node_ctx = coeff_abs_level_transition[1][node_ctx]; } @@ -723,265 +899,247 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c } } else - x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 0 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : + chroma422dc ? 
x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 ); } } -static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int16_t *l ) +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, l, 1 ); + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 ); } -static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int16_t *l ) +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); +} + +static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); +#endif +} +static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ +#if ARCH_X86_64 && HAVE_MMX + h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); +#else + x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); +#endif +} + +static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, l, 0 ); + x264_cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } #endif -#define block_residual_write_cabac_cbf( h, cb, i_ctxBlockCat, i_idx, l, b_intra ) \ -{ \ - int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \ +#define x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, b_dc, name )\ +do\ +{\ + int ctxidxinc = x264_cabac_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra, b_dc );\ if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\ {\ 
x264_cabac_encode_decision( cb, ctxidxinc, 1 );\ - block_residual_write_cabac( h, cb, i_ctxBlockCat, l ); \ + x264_cabac_block_residual##name( h, cb, ctx_block_cat, l );\ }\ else\ x264_cabac_encode_decision( cb, ctxidxinc, 0 );\ -} +} while(0) -void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) +#define x264_cabac_block_residual_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ + x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 1, ) + +#define x264_cabac_block_residual_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ + x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, ) + +#define x264_cabac_block_residual_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ + x264_cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, _8x8 ) + +#define x264_cabac_block_residual_422_dc_cbf( h, cb, ch, b_intra )\ + x264_cabac_block_residual_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, 1, _422_dc ) + +static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma ) { const int i_mb_type = h->mb.i_type; - int i_list; - int i; #if !RDO_SKIP_BS const int i_mb_pos_start = x264_cabac_pos( cb ); int i_mb_pos_tex; + + if( SLICE_MBAFF && + (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) ) + { + x264_cabac_field_decoding_flag( h, cb ); + } #endif - /* Write the MB type */ - x264_cabac_mb_type( h, cb ); + if( h->sh.i_type == SLICE_TYPE_P ) + x264_cabac_mb_header_p( h, cb, i_mb_type, chroma ); + else if( h->sh.i_type == SLICE_TYPE_B ) + x264_cabac_mb_header_b( h, cb, i_mb_type, chroma ); + else //if( h->sh.i_type == SLICE_TYPE_I ) + x264_cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_I, chroma ); #if !RDO_SKIP_BS + i_mb_pos_tex = x264_cabac_pos( cb ); + h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; + if( i_mb_type == I_PCM ) { - i_mb_pos_tex = 
x264_cabac_pos( cb ); - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; - - memcpy( cb->p, h->mb.pic.p_fenc[0], 256 ); - cb->p += 256; - for( i = 0; i < 8; i++ ) - memcpy( cb->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 ); - cb->p += 64; - for( i = 0; i < 8; i++ ) - memcpy( cb->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 ); - cb->p += 64; - - cb->i_low = 0; - cb->i_range = 0x01FE; - cb->i_queue = -1; - cb->i_bytes_outstanding = 0; - - /* if PCM is chosen, we need to store reconstructed frame data */ - h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 ); - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 ); + bs_t s; + bs_init( &s, cb->p, cb->p_end - cb->p ); + + for( int p = 0; p < plane_count; p++ ) + for( int i = 0; i < 256; i++ ) + bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] ); + if( chroma ) + for( int ch = 1; ch < 3; ch++ ) + for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ ) + for( int j = 0; j < 8; j++ ) + bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); + + bs_flush( &s ); + cb->p = s.p; + x264_cabac_encode_init_core( cb ); h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex; return; } #endif - if( IS_INTRA( i_mb_type ) ) + if( i_mb_type != I_16x16 ) { - if( h->pps->b_transform_8x8_mode && i_mb_type != I_16x16 ) - x264_cabac_mb_transform_size( h, cb ); - - if( i_mb_type != I_16x16 ) - { - int di = h->mb.b_transform_8x8 ? 
4 : 1; - for( i = 0; i < 16; i += di ) - { - const int i_pred = x264_mb_predict_intra4x4_mode( h, i ); - const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] ); - x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - } - } + x264_cabac_cbp_luma( h, cb ); + if( chroma ) + x264_cabac_cbp_chroma( h, cb ); + } - x264_cabac_mb_intra_chroma_pred_mode( h, cb ); + if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma ) + { + x264_cabac_transform_size( h, cb ); } - else if( i_mb_type == P_L0 ) + + if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 ) { - if( h->mb.i_partition == D_16x16 ) - { - if( h->mb.pic.i_fref[0] > 1 ) - { - x264_cabac_mb_ref( h, cb, 0, 0 ); - } - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 ); - } - else if( h->mb.i_partition == D_16x8 ) + const int b_intra = IS_INTRA( i_mb_type ); + x264_cabac_qp_delta( h, cb ); + + /* write residual */ + if( i_mb_type == I_16x16 ) { - if( h->mb.pic.i_fref[0] > 1 ) + /* DC Luma */ + for( int p = 0; p < plane_count; p++ ) { - x264_cabac_mb_ref( h, cb, 0, 0 ); - x264_cabac_mb_ref( h, cb, 0, 8 ); + x264_cabac_block_residual_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 ); + + /* AC Luma */ + if( h->mb.i_cbp_luma ) + for( int i = p*16; i < p*16+16; i++ ) + x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_AC][p], i, h->dct.luma4x4[i]+1, 1 ); } - x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 ); - x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 ); } - else //if( h->mb.i_partition == D_8x16 ) + else if( h->mb.b_transform_8x8 ) { - if( h->mb.pic.i_fref[0] > 1 ) + if( plane_count == 3 ) { - x264_cabac_mb_ref( h, cb, 0, 0 ); - x264_cabac_mb_ref( h, cb, 0, 4 ); - } - x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 ); - x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 ); - } - } - else if( i_mb_type == P_8x8 ) - { - /* sub mb type */ - for( i = 0; i < 4; i++ ) - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i] ); + ALIGNED_4( uint8_t nnzbak[3][8] 
); - /* ref 0 */ - if( h->mb.pic.i_fref[0] > 1 ) - { - x264_cabac_mb_ref( h, cb, 0, 0 ); - x264_cabac_mb_ref( h, cb, 0, 4 ); - x264_cabac_mb_ref( h, cb, 0, 8 ); - x264_cabac_mb_ref( h, cb, 0, 12 ); - } +/* Stupid nnz munging in the case that neighbors don't have + * 8x8 transform enabled. */ +#define BACKUP( dst, src, res )\ + dst = src;\ + src = res; - for( i = 0; i < 4; i++ ) - x264_cabac_mb8x8_mvd( h, cb, i ); - } - else if( i_mb_type == B_8x8 ) - { - /* sub mb type */ - for( i = 0; i < 4; i++ ) - x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i] ); +#define RESTORE( dst, src, res )\ + src = dst; - /* ref */ - if( h->mb.pic.i_fref[0] > 1 ) - for( i = 0; i < 4; i++ ) - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] ) - x264_cabac_mb_ref( h, cb, 0, 4*i ); +#define MUNGE_8x8_NNZ( MUNGE )\ +if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\ +{\ + MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\ + MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\ + MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\ + MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\ + MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\ + MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\ +}\ +if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\ +{\ + MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\ + MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\ + MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\ + MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\ + MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\ + MUNGE( nnzbak[2][3], 
h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\ +}\ +if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\ +{\ + MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\ + MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\ + MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\ +} - if( h->mb.pic.i_fref[1] > 1 ) - for( i = 0; i < 4; i++ ) - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) - x264_cabac_mb_ref( h, cb, 1, 4*i ); + MUNGE_8x8_NNZ( BACKUP ) - for( i = 0; i < 4; i++ ) - if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] ) - x264_cabac_mb_mvd( h, cb, 0, 4*i, 2, 2 ); + for( int p = 0; p < 3; p++ ) + FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) + x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra ); - for( i = 0; i < 4; i++ ) - if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) - x264_cabac_mb_mvd( h, cb, 1, 4*i, 2, 2 ); - } - else if( i_mb_type != B_DIRECT ) - { - /* All B mode */ - const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type]; - if( h->mb.pic.i_fref[0] > 1 ) - { - if( b_list[0][0] ) - x264_cabac_mb_ref( h, cb, 0, 0 ); - if( b_list[0][1] && h->mb.i_partition != D_16x16 ) - x264_cabac_mb_ref( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) ); + MUNGE_8x8_NNZ( RESTORE ) + } + else + { + FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) + x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] ); + } } - if( h->mb.pic.i_fref[1] > 1 ) + else { - if( b_list[1][0] ) - x264_cabac_mb_ref( h, cb, 1, 0 ); - if( b_list[1][1] && h->mb.i_partition != D_16x16 ) - x264_cabac_mb_ref( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) ); + for( int p = 0; p < plane_count; p++ ) + FOREACH_BIT( i8x8, 0, h->mb.i_cbp_luma ) + for( int i = 0; i < 4; i++ ) + 
x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+i8x8*4+p*16, h->dct.luma4x4[i+i8x8*4+p*16], b_intra ); } - for( i_list = 0; i_list < 2; i_list++ ) + + if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */ { - if( h->mb.i_partition == D_16x16 ) + if( CHROMA_FORMAT == CHROMA_422 ) { - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 ); + x264_cabac_block_residual_422_dc_cbf( h, cb, 0, b_intra ); + x264_cabac_block_residual_422_dc_cbf( h, cb, 1, b_intra ); } - else if( h->mb.i_partition == D_16x8 ) + else { - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 ); - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 ); + x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra ); + x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra ); } - else //if( h->mb.i_partition == D_8x16 ) + + if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */ { - if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 ); - if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 ); + int step = 8 << CHROMA_V_SHIFT; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra ); } } } -#if !RDO_SKIP_BS - i_mb_pos_tex = x264_cabac_pos( cb ); - h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; -#endif - - if( i_mb_type != I_16x16 ) - { - x264_cabac_mb_cbp_luma( h, cb ); - x264_cabac_mb_cbp_chroma( h, cb ); - } - - if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma ) - { - x264_cabac_mb_transform_size( h, cb ); - } - - if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 ) - { - const int b_intra = IS_INTRA( i_mb_type ); - x264_cabac_mb_qp_delta( h, cb ); - - /* write residual */ - if( i_mb_type == I_16x16 ) - { - /* DC Luma */ - block_residual_write_cabac_cbf( h, 
cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 1 ); - - /* AC Luma */ - if( h->mb.i_cbp_luma != 0 ) - for( i = 0; i < 16; i++ ) - block_residual_write_cabac_cbf( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 1 ); - } - else if( h->mb.b_transform_8x8 ) - { - for( i = 0; i < 4; i++ ) - if( h->mb.i_cbp_luma & ( 1 << i ) ) - block_residual_write_cabac_8x8( h, cb, h->dct.luma8x8[i] ); - } - else - { - for( i = 0; i < 16; i++ ) - if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) ) - block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], b_intra ); - } - - if( h->mb.i_cbp_chroma&0x03 ) /* Chroma DC residual present */ - { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], b_intra ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], b_intra ); - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */ - for( i = 16; i < 24; i++ ) - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra ); - } - } - #if !RDO_SKIP_BS h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex; #endif } +void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) +{ + if( CHROMA444 ) + x264_macroblock_write_cabac_internal( h, cb, 3, 0 ); + else + x264_macroblock_write_cabac_internal( h, cb, 1, 1 ); +} + #if RDO_SKIP_BS /***************************************************************************** * RD only; doesn't generate a valid bitstream @@ -994,44 +1152,62 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int { const int i_mb_type = h->mb.i_type; int b_8x16 = h->mb.i_partition == D_8x16; - int j; + int plane_count = CHROMA444 ? 
3 : 1; if( i_mb_type == P_8x8 ) { - x264_cabac_mb8x8_mvd( h, cb, i8 ); - x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] ); + x264_cabac_8x8_mvd( h, cb, i8 ); + x264_cabac_subpartition_p( cb, h->mb.i_sub_partition[i8] ); } else if( i_mb_type == P_L0 ) - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 ); + x264_cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 ); else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 ) { - if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 ); - if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mb_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 ); + if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<<b_8x16 ); + if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) x264_cabac_mvd( h, cb, 1, 4*i8, 4>>b_8x16, 2<<b_8x16 ); } else //if( i_mb_type == B_8x8 ) { if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] ) - x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 2 ); + x264_cabac_mvd( h, cb, 0, 4*i8, 2, 2 ); if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] ) - x264_cabac_mb_mvd( h, cb, 1, 4*i8, 2, 2 ); + x264_cabac_mvd( h, cb, 1, 4*i8, 2, 2 ); } - for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) + for( int j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) { if( h->mb.i_cbp_luma & (1 << i8) ) { if( h->mb.b_transform_8x8 ) - block_residual_write_cabac_8x8( h, cb, h->dct.luma8x8[i8] ); - else { - int i4; - for( i4 = 0; i4 < 4; i4++ ) - block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 0 ); + if( CHROMA444 ) + for( int p = 0; p < 3; p++ ) + x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 0 ); + else + x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] ); } + else + for( int p = 0; p < plane_count; p++ ) + for( int i4 = 0; i4 < 4; i4++ ) + x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16], 0 ); } - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 0 ); + if( h->mb.i_cbp_chroma ) + { + if( CHROMA_FORMAT == CHROMA_422 ) + { + int offset = (5*i8) & 0x09; + x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 ); +
x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 ); + x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 ); + x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 ); + } + else + { + x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 ); + x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 ); + } + } i8 += x264_pixel_size[i_pixel].h >> 3; } @@ -1040,15 +1216,16 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel ) { int b_8x4 = i_pixel == PIXEL_8x4; - block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 0 ); + int plane_count = CHROMA444 ? 3 : 1; if( i_pixel == PIXEL_4x4 ) - { - x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 ); - } + x264_cabac_mvd( h, cb, 0, i4, 1, 1 ); else + x264_cabac_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 ); + for( int p = 0; p < plane_count; p++ ) { - x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 ); - block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 0 ); + x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4, h->dct.luma4x4[p*16+i4], 0 ); + if( i_pixel != PIXEL_4x4 ) + x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4], 0 ); } } @@ -1056,34 +1233,51 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, { const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 ); i_mode = x264_mb_pred_mode4x4_fix( i_mode ); - x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - x264_cabac_mb_cbp_luma( h, cb ); + x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode ); + x264_cabac_cbp_luma( h, cb ); if( h->mb.i_cbp_luma & (1 << i8) ) - 
block_residual_write_cabac_8x8( h, cb, h->dct.luma8x8[i8] ); + { + if( CHROMA444 ) + for( int p = 0; p < 3; p++ ) + x264_cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 1 ); + else + x264_cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] ); + } } static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode ) { const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 ); + int plane_count = CHROMA444 ? 3 : 1; i_mode = x264_mb_pred_mode4x4_fix( i_mode ); - x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - block_residual_write_cabac_cbf( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 1 ); + x264_cabac_intra4x4_pred_mode( cb, i_pred, i_mode ); + for( int p = 0; p < plane_count; p++ ) + x264_cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 ); } -static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) +static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) { - x264_cabac_mb_intra_chroma_pred_mode( h, cb ); - x264_cabac_mb_cbp_chroma( h, cb ); - if( h->mb.i_cbp_chroma > 0 ) + x264_cabac_intra_chroma_pred_mode( h, cb ); + x264_cabac_cbp_chroma( h, cb ); + if( h->mb.i_cbp_chroma ) { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 1 ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 1 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + x264_cabac_block_residual_422_dc_cbf( h, cb, 0, 1 ); + x264_cabac_block_residual_422_dc_cbf( h, cb, 1, 1 ); + } + else + { + x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 ); + x264_cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 ); + } if( h->mb.i_cbp_chroma == 2 ) { - int i; - for( i = 16; i < 24; i++ ) - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 1 ); + int step = 8 << CHROMA_V_SHIFT; 
+ for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + x264_cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 ); } } }