/* Transfer the sign of y onto x: evaluates to x when y >= 0, -x when y < 0.
 * Relies on (y)>>31 being 0 for non-negative y and -1 (all ones) for negative
 * y; (x ^ -1) - (-1) == -x.  NOTE(review): assumes 32-bit int with arithmetic
 * right shift — true on x264's supported compilers, confirm if porting.
 * Arguments are fully parenthesized so expressions like SIGN(a+b, c) expand
 * correctly; y is still evaluated twice, so avoid side effects in y. */
#define SIGN(x,y) (((x)^((y)>>31))-((y)>>31))
+
/* Append a (parent index, level) pair to the level tree and point node ndst
 * at the new entry.  When trellis_level_t fits in 32 bits, write it as one
 * packed 32-bit store instead of a struct assignment.
 * Expands the names `level_tree` and `levels_used` from the caller's scope.
 * Wrapped in do/while(0) so the macro behaves as a single statement and is
 * safe inside unbraced if/else bodies (the bare {...} form was not). */
#define SET_LEVEL(ndst, nsrc, l) do {\
    if( sizeof(trellis_level_t) == sizeof(uint32_t) )\
        M32( &level_tree[levels_used] ) = pack16to32( nsrc.level_idx, l );\
    else\
        level_tree[levels_used] = (trellis_level_t){ nsrc.level_idx, l };\
    ndst.level_idx = levels_used;\
    levels_used++;\
} while( 0 )
+
// encode all values of the dc coef in a block which is known to have no ac
/* Rate-distortion quantization shortcut for a lone DC coefficient.
 * Tries only the two candidate magnitudes q-1 and q (q = |quant_coef|),
 * scores each as weighted squared distortion plus lambda-scaled CABAC bit
 * cost, and returns the signed level with the lowest score (0 if zeroing
 * the coef wins).
 *
 * sign_coef   - original (signed) transform coefficient
 * quant_coef  - conventionally quantized level (search center)
 * unquant_mf  - dequant multiplier; reconstruction is (mf*level + 128) >> 8
 * coef_weight - distortion weight for this coef position
 * lambda2     - RD lambda, fixed-point by LAMBDA_BITS
 * cabac_state - CABAC contexts for coeff_abs_level of this block type
 * cost_sig    - bit cost of the significance flag, fixed-point by CABAC_SIZE_BITS
 */
static NOINLINE
int trellis_dc_shortcut( int sign_coef, int quant_coef, int unquant_mf, int coef_weight, int lambda2, uint8_t *cabac_state, int cost_sig )
{
    uint64_t bscore = TRELLIS_SCORE_MAX;
    int ret = 0;
    int q = abs( quant_coef );
    for( int abs_level = q-1; abs_level <= q; abs_level++ )
    {
        /* dequantize with rounding */
        int unquant_abs_level = (unquant_mf * abs_level + 128) >> 8;

        /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */
        /* NOTE(review): (x+8)&~15 rounds the signed reconstruction to the
         * nearest multiple of 16 before taking the difference — presumably
         * modeling the DC transform's scaling; confirm against the caller. */
        int d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);
        uint64_t score = (uint64_t)d*d * coef_weight;

        /* code the proposed level, and count how much entropy it would take */
        if( abs_level )
        {
            unsigned f8_bits = cost_sig;
            /* UEG0 binarization: unary prefix capped at 14, then exp-golomb suffix */
            int prefix = X264_MIN( abs_level - 1, 14 );
            f8_bits += x264_cabac_size_decision_noup2( cabac_state+1, prefix > 0 );
            f8_bits += x264_cabac_size_unary[prefix][cabac_state[5]];
            if( abs_level >= 15 )
                f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
            /* convert fixed-point bits to the same scale as the distortion term */
            score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
        }

        /* keep the cheaper of the two candidates */
        COPY2_IF_LT( bscore, score, ret, abs_level );
    }
    /* restore the sign of the input coefficient */
    return SIGN(ret, sign_coef);
}
+
// encode one value of one coef in one context
/* Evaluate extending trellis node j (from nodes_prev) by coding this coef
 * with magnitude abs_level, landing in nodes_cur[node_ctx].  The candidate
 * is kept only if its score beats whatever already occupies node_ctx.
 * Appends a level-tree entry via SET_LEVEL on success.
 *
 * j            - source node / CABAC context index in nodes_prev
 * const_level  - template parameter: 0, 1, or 2 (2 == "level > 1")
 * prefix       - unary prefix length of the level binarization
 * suffix_cost  - precomputed exp-golomb suffix cost (fixed-point bits)
 * ssd          - distortion contribution of this level choice
 * cost_siglast - sig/last flag costs for this coef position
 * level_state  - shared input CABAC states for nodes that don't carry their own
 * Returns the updated levels_used count.
 */
static ALWAYS_INLINE
int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_cost,
                  int node_ctx, int level1_ctx, int levelgt1_ctx, uint64_t ssd, int cost_siglast[3],
                  trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
                  trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state )
{
    uint64_t score = nodes_prev[j].score + ssd;
    /* code the proposed level, and count how much entropy it would take */
    /* node 0 pays the index-2 cost, all others index 1 — presumably
     * last-vs-significant flag; confirm against the caller's table. */
    unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
    /* nodes >= 3 carry their own copy of the level contexts; lower nodes
     * still read from the shared input state */
    uint8_t level1_state = (j >= 3) ? nodes_prev[j].cabac_state[level1_ctx>>2] : level_state[level1_ctx];
    f8_bits += x264_cabac_entropy[level1_state ^ (const_level > 1)];
    uint8_t levelgt1_state;
    if( const_level > 1 )
    {
        levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx];
        f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost;
    }
    else
        /* |level| == 1: flat cost of exactly one bit in CABAC fixed-point */
        f8_bits += 1 << CABAC_SIZE_BITS;
    score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );

    /* save the node if it's better than any existing node with the same cabac ctx */
    if( score < nodes_cur[node_ctx].score )
    {
        nodes_cur[node_ctx].score = score;
        if( j == 2 || (j <= 3 && node_ctx == 4) ) // init from input state
            M32(nodes_cur[node_ctx].cabac_state) = M32(level_state+12);
        else if( j >= 3 )
            M32(nodes_cur[node_ctx].cabac_state) = M32(nodes_prev[j].cabac_state);
        if( j >= 3 ) // skip the transition if we're not going to reuse the context
            nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1];
        if( const_level > 1 && node_ctx == 7 )
            nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state];
        /* NOTE(review): this assignment looks redundant — SET_LEVEL below
         * immediately overwrites level_idx with the new tree index. */
        nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx;
        SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level );
    }
    return levels_used;
}
+
+// encode one value of one coef in all contexts, templated by which value that is.
+// in ctx_lo, the set of live nodes is contiguous and starts at ctx0, so return as soon as we've seen one failure.
+// in ctx_hi, they're contiguous within each block of 4 ctxs, but not necessarily starting at the beginning,
+// so exploiting that would be more complicated.
+static NOINLINE
+int trellis_coef0_0( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used )
+{
+ nodes_cur[0].score = nodes_prev[0].score + ssd0;
+ nodes_cur[0].level_idx = nodes_prev[0].level_idx;
+ for( int j = 1; j < 4 && (int64_t)nodes_prev[j].score >= 0; j++ )
+ {
+ nodes_cur[j].score = nodes_prev[j].score;
+ if( j >= 3 )
+ M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
+ SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
+ }
+ return levels_used;
+}
+
+static NOINLINE
+int trellis_coef0_1( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
+ trellis_level_t *level_tree, int levels_used )
+{
+ for( int j = 1; j < 8; j++ )
+ // this branch only affects speed, not function; there's nothing wrong with updating invalid nodes in coef0.
+ if( (int64_t)nodes_prev[j].score >= 0 )
+ {
+ nodes_cur[j].score = nodes_prev[j].score;
+ if( j >= 3 )
+ M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state);
+ SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 );
+ }
+ return levels_used;
+}
+
/* Try coding abs_level from source node j.  __VA_ARGS__ supplies
 * (node_ctx, level1_ctx, levelgt1_ctx) for trellis_coef; the remaining
 * names (abs_level, prefix, suffix_cost, ssd0/ssd1, cost_siglast, nodes_*,
 * level_tree, levels_used, lambda2, level_state) are captured from the
 * caller's scope.  Node 0 uses ssd0, all others ssd1.  In ctx_lo (!ctx_hi)
 * a dead source node ends the contiguous live run, so return early.
 * Cannot be do/while(0)-wrapped because of that early return — invoke only
 * as a full statement. */
#define COEF(const_level, ctx_hi, j, ...)\
    if( !j || (int64_t)nodes_prev[j].score >= 0 )\
        levels_used = trellis_coef( j, const_level, abs_level, prefix, suffix_cost, __VA_ARGS__,\
                                    j?ssd1:ssd0, cost_siglast, nodes_cur, nodes_prev,\
                                    level_tree, levels_used, lambda2, level_state );\
    else if( !ctx_hi )\
        return levels_used;
+
/* Code |level| == 1 from the ctx_lo nodes (0-3).
 * COEF columns: (const_level, ctx_hi, source j, node_ctx, level1_ctx,
 * levelgt1_ctx).  Returns the updated levels_used count. */
static NOINLINE
int trellis_coef1_0( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
                     trellis_level_t *level_tree, int levels_used, int lambda2,
                     uint8_t *level_state )
{
    /* fixed binarization parameters for |level| == 1 */
    int abs_level = 1, prefix = 1, suffix_cost = 0;
    COEF( 1, 0, 0, 1, 1, 0 );
    COEF( 1, 0, 1, 2, 2, 0 );
    COEF( 1, 0, 2, 3, 3, 0 );
    COEF( 1, 0, 3, 3, 4, 0 );
    return levels_used;
}
+
/* Code |level| == 1 from the ctx_hi nodes (1-7).
 * COEF columns: (const_level, ctx_hi, source j, node_ctx, level1_ctx,
 * levelgt1_ctx).  Returns the updated levels_used count. */
static NOINLINE
int trellis_coef1_1( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
                     trellis_level_t *level_tree, int levels_used, int lambda2,
                     uint8_t *level_state )
{
    /* fixed binarization parameters for |level| == 1 */
    int abs_level = 1, prefix = 1, suffix_cost = 0;
    COEF( 1, 1, 1, 2, 2, 0 );
    COEF( 1, 1, 2, 3, 3, 0 );
    COEF( 1, 1, 3, 3, 4, 0 );
    COEF( 1, 1, 4, 4, 0, 0 );
    COEF( 1, 1, 5, 5, 0, 0 );
    COEF( 1, 1, 6, 6, 0, 0 );
    COEF( 1, 1, 7, 7, 0, 0 );
    return levels_used;
}
+
/* Code |level| > 1 from the ctx_lo nodes (0-3); all transitions land in
 * node_ctx 4 with levelgt1_ctx 5.  The levelgt1_ctx parameter is unused
 * here — presumably kept to mirror trellis_coefn_1's signature; confirm at
 * the call site.  Returns the updated levels_used count. */
static NOINLINE
int trellis_coefn_0( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
                     trellis_level_t *level_tree, int levels_used, int lambda2,
                     uint8_t *level_state, int levelgt1_ctx )
{
    /* unary prefix capped at 14; levels >= 15 add an exp-golomb suffix */
    int prefix = X264_MIN( abs_level-1, 14 );
    int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
    COEF( 2, 0, 0, 4, 1, 5 );
    COEF( 2, 0, 1, 4, 2, 5 );
    COEF( 2, 0, 2, 4, 3, 5 );
    COEF( 2, 0, 3, 4, 4, 5 );
    return levels_used;
}
+
/* Code |level| > 1 from the ctx_hi nodes (1-7).
 * COEF columns: (const_level, ctx_hi, source j, node_ctx, level1_ctx,
 * levelgt1_ctx); node 7 is saturated and takes the caller-supplied
 * levelgt1_ctx.  Returns the updated levels_used count. */
static NOINLINE
int trellis_coefn_1( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3],
                     trellis_node_t *nodes_cur, trellis_node_t *nodes_prev,
                     trellis_level_t *level_tree, int levels_used, int lambda2,
                     uint8_t *level_state, int levelgt1_ctx )
{
    /* unary prefix capped at 14; levels >= 15 add an exp-golomb suffix */
    int prefix = X264_MIN( abs_level-1, 14 );
    int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0;
    COEF( 2, 1, 1, 4, 2, 5 );
    COEF( 2, 1, 2, 4, 3, 5 );
    COEF( 2, 1, 3, 4, 4, 5 );
    COEF( 2, 1, 4, 5, 0, 6 );
    COEF( 2, 1, 5, 6, 0, 7 );
    COEF( 2, 1, 6, 7, 0, 8 );
    COEF( 2, 1, 7, 7, 0, levelgt1_ctx );
    return levels_used;
}
+