+ int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac;
+ uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ];
+
+ /* shortcut for dc-only blocks.
+ * this doesn't affect the output, but saves some unnecessary computation. */
+ if( last_nnz == 0 && !dc )
+ {
+ int cost_sig = x264_cabac_size_decision_noup2( &cabac_state_sig[0], 1 )
+ + x264_cabac_size_decision_noup2( &cabac_state_last[0], 1 );
+ dct[0] = trellis_dc_shortcut( orig_coefs[0], quant_coefs[0], unquant_mf[0], coef_weight2[0], lambda2, cabac_state, cost_sig );
+ return !!dct[0];
+ }
+
+#if HAVE_MMX && ARCH_X86_64
+#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
+ cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
+ if( num_coefs == 16 && !dc )
+ if( b_chroma || !h->mb.i_psy_trellis )
+ return h->quantf.trellis_cabac_4x4( TRELLIS_ARGS, b_ac );
+ else
+ return h->quantf.trellis_cabac_4x4_psy( TRELLIS_ARGS, b_ac, h->mb.pic.fenc_dct4[idx&15], h->mb.i_psy_trellis );
+ else if( num_coefs == 64 && !dc )
+ if( b_chroma || !h->mb.i_psy_trellis )
+ return h->quantf.trellis_cabac_8x8( TRELLIS_ARGS, b_interlaced );
+ else
+ return h->quantf.trellis_cabac_8x8_psy( TRELLIS_ARGS, b_interlaced, h->mb.pic.fenc_dct8[idx&3], h->mb.i_psy_trellis);
+ else if( num_coefs == 8 && dc )
+ return h->quantf.trellis_cabac_chroma_422_dc( TRELLIS_ARGS );
+ else if( dc )
+ return h->quantf.trellis_cabac_dc( TRELLIS_ARGS, num_coefs-1 );
+#endif
+
+ // (# of coefs) * (# of ctx) * (# of levels tried) = 64 * 8 * 2 = 1024
+ // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
+ // but it takes more time to remove dead states than you gain in reduced memory.
+ trellis_level_t level_tree[64*8*2];
+ int levels_used = 1;
+ /* init trellis */
+ trellis_node_t nodes[2][8];
+ trellis_node_t *nodes_cur = nodes[0];
+ trellis_node_t *nodes_prev = nodes[1];
+ trellis_node_t *bnode;
+ for( int j = 1; j < 4; j++ )
+ nodes_cur[j].score = TRELLIS_SCORE_MAX;
+ nodes_cur[0].score = TRELLIS_SCORE_BIAS;
+ nodes_cur[0].level_idx = 0;
+ level_tree[0].abs_level = 0;
+ level_tree[0].next = 0;
+ ALIGNED_4( uint8_t level_state[16] );
+ memcpy( level_state, cabac_state, 10 );
+ level_state[12] = cabac_state[0]; // packed subset for copying into trellis_node_t
+ level_state[13] = cabac_state[4];
+ level_state[14] = cabac_state[8];
+ level_state[15] = cabac_state[9];
+
+ idx &= num_coefs == 64 ? 3 : 15;
+
+ // coefs are processed in reverse order, because that's how the abs value is coded.
+ // last_coef and significant_coef flags are normally coded in forward order, but
+ // we have to reverse them to match the levels.
+ // in 4x4 blocks, last_coef and significant_coef use a separate context for each
+ // position, so the order doesn't matter, and we don't even have to update their contexts.
+ // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
+ // cabac isn't too sensitive.
+ int i = last_nnz;
+#define TRELLIS_LOOP(ctx_hi)\
+ for( ; i >= b_ac; i-- )\
+ {\
+ /* skip 0s: this doesn't affect the output, but saves some unnecessary computation. */\
+ if( !quant_coefs[i] )\
+ {\
+ /* no need to calculate ssd of 0s: it's the same in all nodes.\
+ * no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.\
+ * subtracting from one score is equivalent to adding to the rest. */\
+ if( !ctx_hi )\
+ {\
+ int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
+ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
+ uint64_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )\
+ * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\
+ nodes_cur[0].score -= cost_sig0;\
+ }\
+ for( int j = 1; j < (ctx_hi?8:4); j++ )\
+ SET_LEVEL( nodes_cur[j], nodes_cur[j], 0 );\
+ continue;\
+ }\
+\
+ int sign_coef = orig_coefs[zigzag[i]];\
+ int abs_coef = abs( sign_coef );\
+ int q = abs( quant_coefs[i] );\
+ int cost_siglast[3]; /* { zero, nonzero, nonzero-and-last } */\
+ XCHG( trellis_node_t*, nodes_cur, nodes_prev );\
+ for( int j = ctx_hi; j < 8; j++ )\
+ nodes_cur[j].score = TRELLIS_SCORE_MAX;\
+\
+ if( i < num_coefs-1 || ctx_hi )\
+ {\
+ int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\
+ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
+ int lastindex = !dc && num_coefs == 64 ? x264_last_coeff_flag_offset_8x8[i] :\
+ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\
+ cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );\
+ int cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );\
+ cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;\
+ if( !ctx_hi )\
+ cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;\
+ }\
+ else\
+ {\
+ cost_siglast[0] = cost_siglast[1] = cost_siglast[2] = 0;\
+ }\
+\
+ /* there are a few cases where increasing the coeff magnitude helps,\
+ * but it's only around .003 dB, and skipping them ~doubles the speed of trellis.\
+ * could also try q-2: that sometimes helps, but also sometimes decimates blocks\
+ * that are better left coded, especially at QP > 40. */\
+ uint64_t ssd0[2], ssd1[2];\
+ for( int k = 0; k < 2; k++ )\
+ {\
+ int abs_level = q-1+k;\
+ int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);\
+ int d = abs_coef - unquant_abs_level;\
+ /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */\
+ if( h->mb.i_psy_trellis && i && !dc && !b_chroma )\
+ {\
+ int orig_coef = (num_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];\
+ int predicted_coef = orig_coef - sign_coef;\
+ int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));\
+ int psy_weight = coef_weight1[zigzag[i]] * h->mb.i_psy_trellis;\
+ ssd1[k] = (uint64_t)d*d * coef_weight2[zigzag[i]] - psy_weight * psy_value;\
+ }\
+ else\
+ /* FIXME: for i16x16 dc is this weight optimal? */\
+ ssd1[k] = (uint64_t)d*d * (dc?256:coef_weight2[zigzag[i]]);\
+ ssd0[k] = ssd1[k];\
+ if( !i && !dc && !ctx_hi )\
+ {\
+ /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */\
+ d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);\
+ ssd0[k] = (uint64_t)d*d * coef_weight2[zigzag[i]];\
+ }\
+ }\
+\
+ /* argument passing imposes some significant overhead here. gcc's interprocedural register allocation isn't up to it. */\
+ switch( q )\
+ {\
+ case 1:\
+ ssd1[0] += (uint64_t)cost_siglast[0] * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\
+ levels_used = trellis_coef0_##ctx_hi( ssd0[0]-ssd1[0], nodes_cur, nodes_prev, level_tree, levels_used );\
+ levels_used = trellis_coef1_##ctx_hi( ssd0[1]-ssd1[0], ssd1[1]-ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\
+ goto next##ctx_hi;\
+ case 2:\
+ levels_used = trellis_coef1_##ctx_hi( ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\
+ levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
+ goto next1;\
+ default:\
+ levels_used = trellis_coefn_##ctx_hi( q-1, ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
+ levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\
+ goto next1;\
+ }\
+ next##ctx_hi:;\
+ }\
+ /* output levels from the best path through the trellis */\
+ bnode = &nodes_cur[ctx_hi];\
+ for( int j = ctx_hi+1; j < (ctx_hi?8:4); j++ )\
+ if( nodes_cur[j].score < bnode->score )\