From: Fiona Glaser
Date: Sat, 21 Aug 2010 07:15:53 +0000 (-0700)
Subject: CAVLC "trellis"
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=268618932b0c065c1ab1eea26311f35937073c58;p=x264

CAVLC "trellis"

~3-10% improved compression with CAVLC.
--trellis is now a valid option with CAVLC.
Perhaps more importantly, this means psy-trellis now works with CAVLC.

This isn't a real trellis; it's just a simplified QNS search, but it takes
enough shortcuts to be roughly as fast as a trellis, at the cost of not being
quite optimal. The name is therefore a bit of a misnomer, but the option name
is reused because it serves the same purpose.

A real trellis would be better, but CAVLC is much harder to trellis than
CABAC: I'm not aware of any published polynomial-time solution that comes
reasonably close to optimal.
---

diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index f510aeb2..e28c723f 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -95,7 +95,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_len
             {
 #if RDO_SKIP_BS
                 /* Weight highly against overflows. */
-                s->i_bits_encoded += 1000000;
+                s->i_bits_encoded += 2000;
 #else
                 x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
                 /* clip level, preserving sign */
@@ -113,7 +113,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_len
     return i_suffix_length;
 }
 
-static int block_residual_write_cavlc( x264_t *h, int i_ctxBlockCat, dctcoef *l, int nC )
+static int block_residual_write_cavlc_internal( x264_t *h, int i_ctxBlockCat, dctcoef *l, int nC )
 {
     bs_t *s = &h->out.bs;
     static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
@@ -199,7 +199,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
     if( !*nnz )\
         bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
     else\
-        *nnz = block_residual_write_cavlc(h,cat,l,nC);\
+        *nnz = block_residual_write_cavlc_internal(h,cat,l,nC);\
 }
 
 static void cavlc_qp_delta( x264_t *h )
diff --git a/encoder/encoder.c b/encoder/encoder.c
index e9225dbf..ea55cb1e 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -683,8 +683,6 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
     }
     h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12);
-    if( !h->param.b_cabac )
-        h->param.analyse.i_trellis = 0;
     h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
     if( !h->param.analyse.b_psy )
     {
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index fa47c8bd..4c906c1e 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -739,7 +739,7 @@ void x264_macroblock_encode( x264_t *h )
     else if( h->mb.b_transform_8x8 )
     {
         ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
-        b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
+        b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
         h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
         h->nr_count[1] += h->mb.b_noise_reduction * 4;
diff --git a/encoder/rdo.c b/encoder/rdo.c
index eba3901b..a8a20ff8 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -410,10 +410,12 @@ typedef struct {
 // comparable to the input. so unquant is the direct inverse of quant,
 // and uses the dct scaling factors, not the idct ones.
 
-static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
-                                 const uint16_t *quant_mf, const int *unquant_mf,
-                                 const int *coef_weight, const uint8_t *zigzag,
-                                 int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
+static ALWAYS_INLINE
+int quant_trellis_cabac( x264_t *h, dctcoef *dct,
+                         const uint16_t *quant_mf, const int *unquant_mf,
+                         const int *coef_weight, const uint8_t *zigzag,
+                         int i_ctxBlockCat, int i_lambda2, int b_ac,
+                         int dc, int i_coefs, int idx )
 {
     int abs_coefs[64], signs[64];
     trellis_node_t nodes[2][8];
@@ -629,35 +631,262 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
     return 1;
 }
 
+/* FIXME: This is a gigantic hack. See below.
+ *
+ * CAVLC is much more difficult to trellis than CABAC.
+ *
+ * CABAC has only three states to track: significance map, last, and the
+ * level state machine.
+ * CAVLC, by comparison, has five: coeff_token (trailing + total),
+ * total_zeroes, zero_run, and the level state machine.
+ *
+ * I know of no paper that has managed to design a close-to-optimal trellis
+ * that covers all five of these and isn't exponential-time. As a result, this
+ * "trellis" isn't: it's just a QNS search. Patches welcome for something better.
+ * It's actually surprisingly fast, albeit not quite optimal. It's pretty close
+ * though; since CAVLC only has 2^16 possible rounding modes (assuming only two
+ * roundings as options), a bruteforce search is feasible. Testing shows
+ * that this QNS is reasonably close to optimal in terms of compression.
+ *
+ * TODO:
+ *  Don't bother changing large coefficients when it wouldn't affect bit cost
+ *  (e.g. only affecting bypassed suffix bits).
+ *  Don't re-run all parts of CAVLC bit cost calculation when not necessary.
+ *  e.g. when changing a coefficient from one non-zero value to another in
+ *  such a way that trailing ones and suffix length isn't affected. */
+static ALWAYS_INLINE
+int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
+                         const uint16_t *quant_mf, const int *unquant_mf,
+                         const int *coef_weight, const uint8_t *zigzag,
+                         int i_ctxBlockCat, int i_lambda2, int b_ac,
+                         int dc, int i_coefs, int idx, int b_8x8 )
+{
+    ALIGNED_16( dctcoef quant_coefs[2][16] );
+    ALIGNED_16( dctcoef coefs[16] ) = {0};
+    int delta_distortion[16];
+    int64_t score = 1ULL<<62;
+    int i, j;
+    const int f = 1<<15;
+    int nC = i_ctxBlockCat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, i_ctxBlockCat == DCT_LUMA_DC ? 0 : idx )];
+
+    /* Code for handling 8x8dct -> 4x4dct CAVLC munging. Input/output use a different
+     * step/start/end than internal processing. */
+    int step = 1;
+    int start = b_ac;
+    int end = i_coefs - 1;
+    if( b_8x8 )
+    {
+        start = idx&3;
+        end = 60 + start;
+        step = 4;
+    }
+
+    i_lambda2 <<= LAMBDA_BITS;
+
+    /* Find last non-zero coefficient. */
+    for( i = end; i >= start; i -= step )
+        if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
+            break;
+
+    if( i < start )
+        goto zeroblock;
+
+    /* Prepare for QNS search: calculate distortion caused by each DCT coefficient
+     * rounding to be searched.
+     *
+     * We only search two roundings (nearest and nearest-1) like in CABAC trellis,
+     * so we just store the difference in distortion between them. */
+    int i_last_nnz = b_8x8 ? i >> 2 : i;
+    int coef_mask = 0;
+    int round_mask = 0;
+    for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
+    {
+        int coef = dct[zigzag[j]];
+        int abs_coef = abs(coef);
+        int sign = coef < 0 ? -1 : 1;
+        int nearest_quant = ( f + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
+        quant_coefs[1][i] = quant_coefs[0][i] = sign * nearest_quant;
+        coefs[i] = quant_coefs[1][i];
+        if( nearest_quant )
+        {
+            /* We initialize the trellis with a deadzone halfway between nearest rounding
+             * and always-round-down. This gives much better results than initializing to either
+             * extreme.
+             * FIXME: should we initialize to the deadzones used by deadzone quant? */
+            int deadzone_quant = ( f/2 + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16;
+            int unquant1 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-0) + 128) >> 8);
+            int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8);
+            int d1 = abs_coef - unquant1;
+            int d0 = abs_coef - unquant0;
+            delta_distortion[i] = (d0*d0 - d1*d1) * (dc?256:coef_weight[j]);
+
+            /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
+            if( h->mb.i_psy_trellis && j && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
+            {
+                int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]];
+                int predicted_coef = orig_coef - coef;
+                int psy_weight = b_8x8 ? x264_dct8_weight_tab[zigzag[j]] : x264_dct4_weight_tab[zigzag[j]];
+                int psy_value0 = h->mb.i_psy_trellis * abs(predicted_coef + unquant0 * sign);
+                int psy_value1 = h->mb.i_psy_trellis * abs(predicted_coef + unquant1 * sign);
+                delta_distortion[i] += (psy_value0 - psy_value1) * psy_weight;
+            }
+
+            quant_coefs[0][i] = sign * (nearest_quant-1);
+            if( deadzone_quant != nearest_quant )
+                coefs[i] = quant_coefs[0][i];
+            else
+                round_mask |= 1 << i;
+        }
+        else
+            delta_distortion[i] = 0;
+        coef_mask |= (!!coefs[i]) << i;
+    }
+
+    /* Calculate the cost of the starting state. */
+    h->out.bs.i_bits_encoded = 0;
+    if( !coef_mask )
+        bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
+    else
+        block_residual_write_cavlc_internal( h, i_ctxBlockCat, coefs + b_ac, nC );
+    score = (int64_t)h->out.bs.i_bits_encoded * i_lambda2;
+
+    /* QNS loop: pick the change that improves RD the most, apply it, repeat.
+     * coef_mask and round_mask are used to simplify tracking of nonzeroness
+     * and rounding modes chosen. */
+    while( 1 )
+    {
+        int64_t iter_score = score;
+        int iter_distortion_delta = 0;
+        int iter_coef = -1;
+        int iter_mask = coef_mask;
+        int iter_round = round_mask;
+        for( i = b_ac; i <= i_last_nnz; i++ )
+        {
+            if( !delta_distortion[i] )
+                continue;
+
+            /* Set up all the variables for this iteration. */
+            int cur_round = round_mask ^ (1 << i);
+            int round_change = (cur_round >> i)&1;
+            int old_coef = coefs[i];
+            int new_coef = quant_coefs[round_change][i];
+            int cur_mask = (coef_mask&~(1 << i))|(!!new_coef << i);
+            int cur_distortion_delta = delta_distortion[i] * (round_change ? -1 : 1);
+            int64_t cur_score = cur_distortion_delta;
+            coefs[i] = new_coef;
+
+            /* Count up bits. */
+            h->out.bs.i_bits_encoded = 0;
+            if( !cur_mask )
+                bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );
+            else
+                block_residual_write_cavlc_internal( h, i_ctxBlockCat, coefs + b_ac, nC );
+            cur_score += (int64_t)h->out.bs.i_bits_encoded * i_lambda2;
+
+            coefs[i] = old_coef;
+            if( cur_score < iter_score )
+            {
+                iter_score = cur_score;
+                iter_coef = i;
+                iter_mask = cur_mask;
+                iter_round = cur_round;
+                iter_distortion_delta = cur_distortion_delta;
+            }
+        }
+        if( iter_coef >= 0 )
+        {
+            score = iter_score - iter_distortion_delta;
+            coef_mask = iter_mask;
+            round_mask = iter_round;
+            coefs[iter_coef] = quant_coefs[((round_mask >> iter_coef)&1)][iter_coef];
+            /* Don't try adjusting coefficients we've already adjusted.
+             * Testing suggests this doesn't hurt results -- and sometimes actually helps. */
+            delta_distortion[iter_coef] = 0;
+        }
+        else
+            break;
+    }
+
+    if( coef_mask )
+    {
+        for( i = b_ac, j = start; i <= i_last_nnz; i++, j += step )
+            dct[zigzag[j]] = coefs[i];
+        for( ; j <= end; j += step )
+            dct[zigzag[j]] = 0;
+        return 1;
+    }
+
+zeroblock:
+    if( !dc )
+    {
+        if( b_8x8 )
+            for( i = start; i <= end; i+=step )
+                dct[zigzag[i]] = 0;
+        else
+            memset( dct, 0, 16*sizeof(dctcoef) );
+    }
+    return 0;
+}
+
 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
 
 int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                            int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma )
 {
-    return quant_trellis_cabac( h, dct,
+    if( h->param.b_cabac )
+        return quant_trellis_cabac( h, dct,
+            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
+            NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
+            i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
+
+    return quant_trellis_cavlc( h, dct,
         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
         NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
+        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0, 0 );
 }
 
 int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                             int i_qp, int i_ctxBlockCat, int b_intra, int b_chroma, int idx )
 {
     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
-    return quant_trellis_cabac( h, dct,
-        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
-        x264_dct4_weight2_zigzag[h->mb.b_interlaced],
-        x264_zigzag_scan4[h->mb.b_interlaced],
-        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
+    if( h->param.b_cabac )
+        return quant_trellis_cabac( h, dct,
+            h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
+            x264_dct4_weight2_zigzag[h->mb.b_interlaced],
+            x264_zigzag_scan4[h->mb.b_interlaced],
+            i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx );
+
+    return quant_trellis_cavlc( h, dct,
+        h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
+        x264_dct4_weight2_zigzag[h->mb.b_interlaced],
+        x264_zigzag_scan4[h->mb.b_interlaced],
+        i_ctxBlockCat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, 0, 16, idx, 0 );
 }
 
 int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
                             int i_qp, int b_intra, int idx )
 {
-    return quant_trellis_cabac( h, dct,
-        h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
-        x264_dct8_weight2_zigzag[h->mb.b_interlaced],
-        x264_zigzag_scan8[h->mb.b_interlaced],
-        DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
-}
+    if( h->param.b_cabac )
+    {
+        return quant_trellis_cabac( h, dct,
+            h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
+            x264_dct8_weight2_zigzag[h->mb.b_interlaced],
+            x264_zigzag_scan8[h->mb.b_interlaced],
+            DCT_LUMA_8x8, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 64, idx );
+    }
+    /* 8x8 CAVLC is split into 4 4x4 blocks */
+    int nzaccum = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        int nz = quant_trellis_cavlc( h, dct,
+            h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
+            x264_dct8_weight2_zigzag[h->mb.b_interlaced],
+            x264_zigzag_scan8[h->mb.b_interlaced],
+            DCT_LUMA_4x4, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 16, idx*4+i, 1 );
+        /* Set up nonzero count for future calls */
+        h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz;
+        nzaccum |= nz;
+    }
+    return nzaccum;
+}
diff --git a/x264.c b/x264.c
index 9c3ce5ef..7d985187 100644
--- a/x264.c
+++ b/x264.c
@@ -595,7 +595,7 @@ static void Help( x264_param_t *defaults, int longhelp )
     H2( "      --no-mixed-refs         Don't decide references on a per partition basis\n" );
     H2( "      --no-chroma-me          Ignore chroma in motion estimation\n" );
     H1( "      --no-8x8dct             Disable adaptive spatial transform size\n" );
-    H1( "  -t, --trellis               Trellis RD quantization. Requires CABAC. [%d]\n"
+    H1( "  -t, --trellis               Trellis RD quantization. [%d]\n"
         "                                  - 0: disabled\n"
         "                                  - 1: enabled only on the final encode of a MB\n"
         "                                  - 2: enabled on all mode decisions\n", defaults->analyse.i_trellis );
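
As an aside for readers unfamiliar with QNS-style searches, here is a minimal,
self-contained sketch of the greedy loop the patch uses, with all the
CAVLC-specific bit counting replaced by a made-up cost model. It is an
illustration only, not part of the patch: rd_cost(), cand[][], choice[] and the
toy data in main() are hypothetical stand-ins for the patch's bit counting via
block_residual_write_cavlc_internal(), its quant_coefs[][]/round_mask state and
the real DCT coefficients. Only the search structure -- try flipping each
coefficient between its two candidate roundings, commit the single best flip,
repeat until nothing improves -- mirrors the code above.

/* Minimal greedy QNS sketch (illustration only, not x264 code).
 * Each coefficient has two candidate roundings:
 *   cand[0][i] = "round down" candidate, cand[1][i] = "nearest" candidate.
 * Starting from an initial choice, repeatedly apply the single flip that
 * lowers the RD score the most, until no flip helps. */
#include <stdint.h>
#include <stdio.h>

#define N 16

/* Toy RD model standing in for "distortion + lambda * CAVLC bits": squared
 * error against the ideal level plus a flat per-nonzero-level "bit" charge.
 * The real code counts actual CAVLC bits here instead. */
static int64_t rd_cost( const int *coefs, const int *ideal, int lambda )
{
    int64_t cost = 0;
    for( int i = 0; i < N; i++ )
    {
        int d = coefs[i] - ideal[i];
        cost += (int64_t)d * d;
        if( coefs[i] )
            cost += 4 * (int64_t)lambda;
    }
    return cost;
}

static void qns_search( int cand[2][N], int *choice, const int *ideal, int lambda )
{
    int coefs[N];
    for( int i = 0; i < N; i++ )
        coefs[i] = cand[choice[i]][i];
    int64_t best = rd_cost( coefs, ideal, lambda );

    for( ;; )
    {
        int best_i = -1;
        int64_t best_score = best;
        for( int i = 0; i < N; i++ )
        {
            int old = coefs[i];
            coefs[i] = cand[!choice[i]][i];   /* try the other rounding of coef i */
            int64_t cur = rd_cost( coefs, ideal, lambda );
            coefs[i] = old;                   /* undo the trial */
            if( cur < best_score )
            {
                best_score = cur;
                best_i = i;
            }
        }
        if( best_i < 0 )
            break;                            /* no single flip improves RD: done */
        choice[best_i] ^= 1;                  /* commit the best flip and iterate */
        coefs[best_i] = cand[choice[best_i]][best_i];
        best = best_score;
    }
}

int main( void )
{
    /* Toy block: a few large levels followed by small ones.  cand[1] is the
     * ideal level itself, cand[0] is one step smaller.  With this cost model
     * only flips that zero a level change the "bit" term, so the search ends
     * up zeroing the trailing 1s and leaving the larger levels alone. */
    int ideal[N], choice[N], cand[2][N];
    for( int i = 0; i < N; i++ )
    {
        ideal[i]   = i < 6 ? 6 - i : (i < 10);
        cand[1][i] = ideal[i];
        cand[0][i] = ideal[i] ? ideal[i] - 1 : 0;
        choice[i]  = 1;
    }
    qns_search( cand, choice, ideal, 1 );
    for( int i = 0; i < N; i++ )
        printf( "%d ", cand[choice[i]][i] );
    printf( "\n" );   /* prints: 6 5 4 3 2 0 0 0 0 0 0 0 0 0 0 0 */
    return 0;
}

The real code differs in that it zeroes delta_distortion[] for coefficients it
has already flipped so they are not revisited, and it only re-counts the CAVLC
bits on each trial while tracking the distortion change incrementally.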
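
The two candidate roundings themselves come from the fixed-point arithmetic
near the top of quant_trellis_cavlc(). Below is a small worked example of that
arithmetic, again purely illustrative: the quantization scale qmf is an
arbitrary made-up value (in the patch it comes from h->quant4_mf/h->quant8_mf,
and the DC path additionally adjusts the scales), while f = 1<<15 and the >>16
shift are as in the patch. The initial QNS state keeps the nearest rounding
only where the deadzone rounding agrees with it.

/* Worked example of the candidate-rounding arithmetic (illustration only).
 * (bias + abs_coef*qmf) >> 16 computes abs_coef*qmf/65536 (here abs_coef/8)
 * with a configurable rounding bias:
 *   bias f   = 0.5  -> round to nearest                    (nearest_quant)
 *   bias f/2 = 0.25 -> halfway between nearest and floor   (deadzone_quant) */
#include <stdio.h>

int main( void )
{
    const int f   = 1 << 15;
    const int qmf = 8192;   /* hypothetical scale: divides by 8 */
    for( int abs_coef = 20; abs_coef <= 60; abs_coef += 20 )
    {
        int nearest_quant  = ( f   + abs_coef * qmf ) >> 16;
        int deadzone_quant = ( f/2 + abs_coef * qmf ) >> 16;
        /* The QNS search starts from nearest_quant-1 wherever the deadzone
         * rounding would have rounded down, i.e. where the two disagree. */
        int initial = deadzone_quant != nearest_quant ? nearest_quant - 1 : nearest_quant;
        printf( "abs_coef=%2d  nearest=%d  deadzone=%d  initial=%d\n",
                abs_coef, nearest_quant, deadzone_quant, initial );
    }
    return 0;   /* 20 -> 3/2/2, 40 -> 5/5/5, 60 -> 8/7/7 */
}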
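
Finally, the b_8x8 path relies on the fact that, for CAVLC, an 8x8 block is
coded as four interleaved 4x4 blocks: sub-block k owns every fourth position
of the 8x8 zigzag scan, starting at offset k. The snippet below only
illustrates that start/end/step arithmetic (start = idx&3, end = 60+start,
step = 4); the actual scan-order table is x264_zigzag_scan8 in the encoder.

/* Illustration of the 8x8 -> 4x4 CAVLC interleave used by the b_8x8 path:
 * each of the four 4x4 sub-blocks covers 16 of the 64 scan positions,
 * stepping by 4 through the 8x8 zigzag order. */
#include <stdio.h>

int main( void )
{
    for( int k = 0; k < 4; k++ )   /* k plays the role of idx&3 */
    {
        int start = k, end = 60 + start, step = 4;
        printf( "sub-block %d: positions", k );
        for( int i = start; i <= end; i += step )
            printf( " %2d", i );
        printf( "\n" );
    }
    return 0;
}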