+ levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx];
+ f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost;
+ }
+ else
+ f8_bits += 1 << CABAC_SIZE_BITS;
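+ // fold the rate term into the rd score: f8_bits is a fixed-point bit count
+ // (CABAC_SIZE_BITS fractional bits), weighted by lambda2 and shifted into the
+ // same domain as the ssd term.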
+ score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+
+ /* save the node if it's better than any existing node with the same cabac ctx */
+ if( score < nodes_cur[node_ctx].score )
+ {
+ nodes_cur[node_ctx].score = score;
+ if( j == 2 || (j <= 3 && node_ctx == 4) ) // init from input state
+ M32(nodes_cur[node_ctx].cabac_state) = M32(level_state+12);
+ else if( j >= 3 )
+ M32(nodes_cur[node_ctx].cabac_state) = M32(nodes_prev[j].cabac_state);
+ if( j >= 3 ) // skip the transition if we're not going to reuse the context
+ nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1];
+ if( const_level > 1 && node_ctx == 7 )
+ nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state];
+ nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx;
+ SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level );
+ }
+ return levels_used;
+}
+
+// encode one value of one coef in all contexts, templated on which value that is.
+// in ctx_lo, the set of live nodes is contiguous and starts at ctx0, so return as soon as we've seen one failure.
+// in ctx_hi, they're contiguous within each block of 4 ctxs, but not necessarily starting at the beginning,
+// so exploiting that would be more complicated.
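+// coef0/coef1/coefn below handle abs_level == 0, abs_level == 1 and abs_level > 1;
+// the _0 suffix is the ctx_lo pass (nodes 0..3), _1 the ctx_hi pass (nodes 1..7).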
+static NOINLINE
+int trellis_coef0_0( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used )
+{
+ nodes_cur[0].score = nodes_prev[0].score + ssd0;
+ nodes_cur[0].level_idx = nodes_prev[0].level_idx;
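+ // invalid nodes have the sign bit of their score set, so the signed comparison doubles as a liveness test.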
+ for( int j = 1; j < 4 && (int64_t)nodes_prev[j].score >= 0; j++ )
+ {
+ nodes_cur[j].score = nodes_prev[j].score;
+ if( j >= 3 )
+ M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
+ SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
+ }
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coef0_1( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used )
+{
+ for( int j = 1; j < 8; j++ )
+ // this branch only affects speed, not function; there's nothing wrong with updating invalid nodes in coef0.
+ if( (int64_t)nodes_prev[j].score >= 0 )
+ {
+ nodes_cur[j].score = nodes_prev[j].score;
+ if( j >= 3 )
+ M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
+ SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
+ }
+ return levels_used;
+}
+
+#define COEF(const_level, ctx_hi, j, ...)\
+ if( !j || (int64_t)nodes_prev[j].score >= 0 )\
+ levels_used = trellis_coef( j, const_level, abs_level, prefix, suffix_cost, __VA_ARGS__,\
+ j?ssd1:ssd0, cost_siglast, nodes_cur, nodes_prev,\
+ level_tree, levels_used, lambda2, level_state );\
+ else if( !ctx_hi )\
+ return levels_used;
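+// the trailing COEF() arguments are the cabac ctx selectors forwarded to trellis_coef
+// (node_ctx, level1_ctx, levelgt1_ctx, as used in the node-save code above); in ctx_lo
+// the first dead node ends the run of live nodes, hence the early return.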
+
+static NOINLINE
+int trellis_coef1_0( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state )
+{
+ int abs_level = 1, prefix = 1, suffix_cost = 0;
+ COEF( 1, 0, 0, 1, 1, 0 );
+ COEF( 1, 0, 1, 2, 2, 0 );
+ COEF( 1, 0, 2, 3, 3, 0 );
+ COEF( 1, 0, 3, 3, 4, 0 );
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coef1_1( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state )
+{
+ int abs_level = 1, prefix = 1, suffix_cost = 0;
+ COEF( 1, 1, 1, 2, 2, 0 );
+ COEF( 1, 1, 2, 3, 3, 0 );
+ COEF( 1, 1, 3, 3, 4, 0 );
+ COEF( 1, 1, 4, 4, 0, 0 );
+ COEF( 1, 1, 5, 5, 0, 0 );
+ COEF( 1, 1, 6, 6, 0, 0 );
+ COEF( 1, 1, 7, 7, 0, 0 );
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coefn_0( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state, int levelgt1_ctx )
+{
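+ // coeff_abs_level_minus1 is UEG0-binarized: a truncated-unary prefix capped at 14,
+ // plus, for abs_level >= 15, a bypass-coded exp-golomb suffix whose length in whole
+ // bits is scaled up to the CABAC_SIZE_BITS fixed-point domain.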
+ int prefix = X264_MIN( abs_level-1, 14 );
+ int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
+ COEF( 2, 0, 0, 4, 1, 5 );
+ COEF( 2, 0, 1, 4, 2, 5 );
+ COEF( 2, 0, 2, 4, 3, 5 );
+ COEF( 2, 0, 3, 4, 4, 5 );
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coefn_1( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+ trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used, int lambda2,
+ uint8_t *level_state, int levelgt1_ctx )
+{
+ int prefix = X264_MIN( abs_level-1, 14 );
+ int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
+ COEF( 2, 1, 1, 4, 2, 5 );
+ COEF( 2, 1, 2, 4, 3, 5 );
+ COEF( 2, 1, 3, 4, 4, 5 );
+ COEF( 2, 1, 4, 5, 0, 6 );
+ COEF( 2, 1, 5, 6, 0, 7 );
+ COEF( 2, 1, 6, 7, 0, 8 );
+ COEF( 2, 1, 7, 7, 0, levelgt1_ctx );
+ return levels_used;
+}
+
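+// rd-optimal quantization of one block: run a trellis over the relevant cabac contexts,
+// picking each coef's level so as to minimize ssd + lambda2 * bits.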
+static ALWAYS_INLINE
+int quant_trellis_cabac( x264_t *h, dctcoef *dct,
+ udctcoef *quant_mf, udctcoef *quant_bias, const int *unquant_mf,
+ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
+ int b_chroma, int dc, int num_coefs, int idx )
+{
+ ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
+ ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
+ const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
+ const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
+ const int b_interlaced = MB_INTERLACED;
+ uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
+ uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
+ int levelgt1_ctx = b_chroma && dc ? 8 : 9;
+
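+ // run the normal quantizer first: if the block quantizes to all zeros we can return
+ // immediately; otherwise keep the unquantized coefs in orig_coefs (for distortion)
+ // and the zigzag-scanned quantized coefs as the trellis starting point.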
+ if( dc )
+ {
+ if( num_coefs == 16 )
+ {
+ memcpy( orig_coefs, dct, sizeof(dctcoef)*16 );
+ if( !h->quantf.quant_4x4_dc( dct, quant_mf[0] >> 1, quant_bias[0] << 1 ) )
+ return 0;
+ h->zigzagf.scan_4x4( quant_coefs, dct );
+ }
+ else
+ {
+ memcpy( orig_coefs, dct, sizeof(dctcoef)*num_coefs );
+ int nz = h->quantf.quant_2x2_dc( &dct[0], quant_mf[0] >> 1, quant_bias[0] << 1 );
+ if( num_coefs == 8 )
+ nz |= h->quantf.quant_2x2_dc( &dct[4], quant_mf[0] >> 1, quant_bias[0] << 1 );
+ if( !nz )
+ return 0;
+ for( int i = 0; i < num_coefs; i++ )
+ quant_coefs[i] = dct[zigzag[i]];
+ }
+ }
+ else
+ {
+ if( num_coefs == 64 )
+ {
+ h->mc.memcpy_aligned( orig_coefs, dct, sizeof(dctcoef)*64 );
+ if( !h->quantf.quant_8x8( dct, quant_mf, quant_bias ) )
+ return 0;
+ h->zigzagf.scan_8x8( quant_coefs, dct );
+ }
+ else //if( num_coefs == 16 )
+ {
+ memcpy( orig_coefs, dct, sizeof(dctcoef)*16 );
+ if( !h->quantf.quant_4x4( dct, quant_mf, quant_bias ) )
+ return 0;
+ h->zigzagf.scan_4x4( quant_coefs, dct );
+ }