+ /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
+ int d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);
+ uint64_t score = (uint64_t)d*d * coef_weight;
+
+ /* code the proposed level, and count how much entropy it would take */
+ if( abs_level )
+ {
+ unsigned f8_bits = cost_sig;
+ int prefix = X264_MIN( abs_level - 1, 14 );
+ f8_bits += x264_cabac_size_decision_noup2( cabac_state+1, prefix > 0 );
+ f8_bits += x264_cabac_size_unary[prefix][cabac_state[5]];
+ if( abs_level >= 15 )
+ f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
+ score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+ }
+
+ COPY2_IF_LT( bscore, score, ret, abs_level );
+ }
+ return SIGN(ret, sign_coef);
+}
+
+// encode one value of one coef in one context
+// j: source trellis node index (0..7). Nodes 0..2 read the shared
+//    level_state contexts; nodes >= 3 carry a private cabac_state copy.
+// const_level: compile-time constant selecting the cost model — 1 codes a
+//    level of exactly 1, >1 codes the unary prefix + optional suffix path.
+// abs_level: the level value recorded in the trellis on improvement.
+// prefix/suffix_cost: only consumed on the const_level > 1 path.
+// node_ctx: destination node index in nodes_cur.
+// level1_ctx/levelgt1_ctx: cabac context indices for the first level bin
+//    and the level-minus-one unary bins.
+// Returns levels_used; NOTE(review): SET_LEVEL is a macro defined elsewhere
+// and presumably accounts for new level_tree entries — confirm against it.
+static ALWAYS_INLINE
+int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_cost,
+                  int node_ctx, int level1_ctx, int levelgt1_ctx, uint64_t ssd, int cost_siglast[3],
+                  trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+                  trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state )
+{
+    uint64_t score = nodes_prev[j].score + ssd;
+    /* code the proposed level, and count how much entropy it would take */
+    unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
+    /* nodes >= 3 own their level contexts; lower nodes use the shared copy */
+    uint8_t level1_state = (j >= 3) ? nodes_prev[j].cabac_state[level1_ctx>>2] : level_state[level1_ctx];
+    f8_bits += x264_cabac_entropy[level1_state ^ (const_level > 1)];
+    uint8_t levelgt1_state;
+    if( const_level > 1 )
+    {
+        /* unary prefix in the level>1 context, plus any pre-computed suffix bits */
+        levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx];
+        f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost;
+    }
+    else
+        f8_bits += 1 << CABAC_SIZE_BITS; /* level == 1: one more bin, fixed cost */
+    score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
+
+    /* save the node if it's better than any existing node with the same cabac ctx */
+    if( score < nodes_cur[node_ctx].score )
+    {
+        nodes_cur[node_ctx].score = score;
+        if( j == 2 || (j <= 3 && node_ctx == 4) ) // init from input state
+            M32(nodes_cur[node_ctx].cabac_state) = M32(level_state+12);
+        else if( j >= 3 )
+            M32(nodes_cur[node_ctx].cabac_state) = M32(nodes_prev[j].cabac_state);
+        if( j >= 3 ) // skip the transition if we're not going to reuse the context
+            nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1];
+        if( const_level > 1 && node_ctx == 7 )
+            nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state];
+        nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx;
+        SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level );
+    }
+    return levels_used;
+}
+
+// encode one value of one coef in all contexts, templated by which value that is.
+// in ctx_lo, the set of live nodes is contiguous and starts at ctx0, so return as soon as we've seen one failure.
+// in ctx_hi, they're contiguous within each block of 4 ctxs, but not necessarily starting at the beginning,
+// so exploiting that would be more complicated.
+/* Propagate a zero coefficient through the low contexts (nodes 0..3).
+ * Node 0 always survives and absorbs the zero-level distortion ssd0; each
+ * following node is copied forward unchanged. In ctx_lo the live nodes are
+ * contiguous starting at node 0, so the first dead node (negative score)
+ * ends the run. */
+static NOINLINE
+int trellis_coef0_0( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+                     trellis_level_t *level_tree, int levels_used )
+{
+    nodes_cur[0].score = nodes_prev[0].score + ssd0;
+    nodes_cur[0].level_idx = nodes_prev[0].level_idx;
+    for( int ctx = 1; ctx < 4; ctx++ )
+    {
+        if( (int64_t)nodes_prev[ctx].score < 0 )
+            break; /* dead node: the contiguous live run is over */
+        nodes_cur[ctx].score = nodes_prev[ctx].score;
+        if( ctx == 3 ) /* only node 3 carries its own cabac state in this range */
+            M32(nodes_cur[ctx].cabac_state) = M32(nodes_prev[ctx].cabac_state);
+        SET_LEVEL( nodes_cur[ctx], nodes_prev[ctx], 0 );
+    }
+    return levels_used;
+}
+
+/* Propagate a zero coefficient through the high contexts (nodes 1..7).
+ * ssd0 is accepted for signature parity with trellis_coef0_0 but is not
+ * consumed here. Unlike the ctx_lo variant, live nodes are only contiguous
+ * within each block of 4, so dead nodes are skipped instead of terminating
+ * the scan. */
+static NOINLINE
+int trellis_coef0_1( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+                     trellis_level_t *level_tree, int levels_used )
+{
+    for( int ctx = 1; ctx < 8; ctx++ )
+    {
+        /* skipping dead nodes is purely a speed win; updating invalid nodes
+         * in coef0 would be functionally harmless */
+        if( (int64_t)nodes_prev[ctx].score < 0 )
+            continue;
+        nodes_cur[ctx].score = nodes_prev[ctx].score;
+        if( ctx >= 3 ) /* nodes >= 3 own a private cabac state copy */
+            M32(nodes_cur[ctx].cabac_state) = M32(nodes_prev[ctx].cabac_state);
+        SET_LEVEL( nodes_cur[ctx], nodes_prev[ctx], 0 );
+    }
+    return levels_used;
+}
+
+/* Attempt one trellis transition from node j, forwarding the destination
+ * (node_ctx, level1_ctx, levelgt1_ctx) through __VA_ARGS__ and picking up
+ * the caller's locals (abs_level, prefix, suffix_cost, ssd0/ssd1, ...).
+ * A negative score marks a dead node: when !ctx_hi the live nodes are
+ * contiguous from node 0, so the first dead node returns early; when
+ * ctx_hi, dead nodes are simply skipped. */
+#define COEF(const_level, ctx_hi, j, ...)\
+    if( !j || (int64_t)nodes_prev[j].score >= 0 )\
+        levels_used = trellis_coef( j, const_level, abs_level, prefix, suffix_cost, __VA_ARGS__,\
+                                    j?ssd1:ssd0, cost_siglast, nodes_cur, nodes_prev,\
+                                    level_tree, levels_used, lambda2, level_state );\
+    else if( !ctx_hi )\
+        return levels_used;
+
+// abs_level == 1 in the low contexts. Each COEF line is
+// ( const_level, ctx_hi, j, node_ctx, level1_ctx, levelgt1_ctx );
+// levelgt1_ctx is 0 because trellis_coef only reads it for const_level > 1,
+// and prefix/suffix_cost are likewise unused on the level==1 cost path.
+static NOINLINE
+int trellis_coef1_0( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+                     trellis_level_t *level_tree, int levels_used, int lambda2,
+                     uint8_t *level_state )
+{
+    int abs_level = 1, prefix = 1, suffix_cost = 0;
+    COEF( 1, 0, 0, 1, 1, 0 );
+    COEF( 1, 0, 1, 2, 2, 0 );
+    COEF( 1, 0, 2, 3, 3, 0 );
+    COEF( 1, 0, 3, 3, 4, 0 );
+    return levels_used;
+}
+
+// abs_level == 1 in the high contexts (ctx_hi = 1): dead nodes are skipped
+// rather than ending the function, and node 0 is not visited here.
+// COEF args: ( const_level, ctx_hi, j, node_ctx, level1_ctx, levelgt1_ctx );
+// nodes 4..7 map back to themselves with level1_ctx 0 (the level>1 context
+// group), and levelgt1_ctx stays 0/unused since const_level == 1.
+static NOINLINE
+int trellis_coef1_1( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+                     trellis_level_t *level_tree, int levels_used, int lambda2,
+                     uint8_t *level_state )
+{
+    int abs_level = 1, prefix = 1, suffix_cost = 0;
+    COEF( 1, 1, 1, 2, 2, 0 );
+    COEF( 1, 1, 2, 3, 3, 0 );
+    COEF( 1, 1, 3, 3, 4, 0 );
+    COEF( 1, 1, 4, 4, 0, 0 );
+    COEF( 1, 1, 5, 5, 0, 0 );
+    COEF( 1, 1, 6, 6, 0, 0 );
+    COEF( 1, 1, 7, 7, 0, 0 );
+    return levels_used;
+}
+
+// abs_level >= 2 in the low contexts. The unary prefix is clamped to 14;
+// levels >= 15 additionally pay an exp-golomb escape suffix, pre-scaled to
+// cabac size units. All transitions land in node_ctx 4 with the first
+// level>1 context (levelgt1_ctx 5):
+// COEF args: ( const_level, ctx_hi, j, node_ctx, level1_ctx, levelgt1_ctx ).
+static NOINLINE
+int trellis_coefn_0( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+                     trellis_level_t *level_tree, int levels_used, int lambda2,
+                     uint8_t *level_state, int levelgt1_ctx )
+{
+    int prefix = X264_MIN( abs_level-1, 14 );
+    int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
+    COEF( 2, 0, 0, 4, 1, 5 );
+    COEF( 2, 0, 1, 4, 2, 5 );
+    COEF( 2, 0, 2, 4, 3, 5 );
+    COEF( 2, 0, 3, 4, 4, 5 );
+    return levels_used;
+}
+
+// abs_level >= 2 in the high contexts. Same prefix clamp and escape-suffix
+// cost as trellis_coefn_0. Destination node and levelgt1 context advance
+// with j, saturating at node 7; the last transition keeps the
+// caller-supplied levelgt1_ctx.
+// COEF args: ( const_level, ctx_hi, j, node_ctx, level1_ctx, levelgt1_ctx ).
+static NOINLINE
+int trellis_coefn_1( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
+                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+                     trellis_level_t *level_tree, int levels_used, int lambda2,
+                     uint8_t *level_state, int levelgt1_ctx )
+{
+    int prefix = X264_MIN( abs_level-1, 14 );
+    int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
+    COEF( 2, 1, 1, 4, 2, 5 );
+    COEF( 2, 1, 2, 4, 3, 5 );
+    COEF( 2, 1, 3, 4, 4, 5 );
+    COEF( 2, 1, 4, 5, 0, 6 );
+    COEF( 2, 1, 5, 6, 0, 7 );
+    COEF( 2, 1, 6, 7, 0, 8 );
+    COEF( 2, 1, 7, 7, 0, levelgt1_ctx );
+    return levels_used;
+}