1 /*****************************************************************************
2 * rdo.c: h264 encoder library (rate-distortion optimization)
3 *****************************************************************************
4 * Copyright (C) 2005-2008 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Fiona Glaser <fiona@x264.com>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *****************************************************************************/
24 /* duplicate all the writer functions, just calculating bit cost
25 * instead of writing the bitstream.
26 * TODO: use these for fast 1st pass too. */
30 /* Transition and size tables for abs<9 MVD and residual coding */
31 /* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
32 static uint8_t cabac_transition_unary[15][128];
33 static uint16_t cabac_size_unary[15][128];
34 /* Transition and size tables for abs>9 MVD */
35 /* Consist of 5 1s and a bypass sign bit */
36 static uint8_t cabac_transition_5ones[128];
37 static uint16_t cabac_size_5ones[128];
39 /* CAVLC: produces exactly the same bit count as a normal encode */
40 /* this probably still leaves some unnecessary computations */
41 #define bs_write1(s,v) ((s)->i_bits_encoded += 1)
42 #define bs_write(s,n,v) ((s)->i_bits_encoded += (n))
43 #define bs_write_ue(s,v) ((s)->i_bits_encoded += bs_size_ue(v))
44 #define bs_write_se(s,v) ((s)->i_bits_encoded += bs_size_se(v))
45 #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
46 #define x264_macroblock_write_cavlc static x264_macroblock_size_cavlc
49 /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
50 * fractional bits, but only finite precision. */
51 #undef x264_cabac_encode_decision
52 #undef x264_cabac_encode_decision_noup
53 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
54 #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
55 #define x264_cabac_encode_terminal(c) x264_cabac_size_decision_noup(c,276,0)
56 #define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256)
57 #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
58 #define x264_cabac_encode_flush(h,c)
59 #define x264_macroblock_write_cabac static x264_macroblock_size_cabac
62 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
63 sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
66 /* Sum the cached SATDs to avoid repeating them. */
67 static inline int sum_satd( x264_t *h, int pixel, int x, int y )
72 int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
73 int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
74 if( pixel == PIXEL_16x16 )
75 return h->mb.pic.fenc_satd_sum;
76 for( y = min_y; y < max_y; y++ )
77 for( x = min_x; x < max_x; x++ )
78 satd += h->mb.pic.fenc_satd[y][x];
82 static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
87 int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
88 int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
89 if( pixel == PIXEL_16x16 )
90 return h->mb.pic.fenc_sa8d_sum;
91 for( y = min_y; y < max_y; y++ )
92 for( x = min_x; x < max_x; x++ )
93 sa8d += h->mb.pic.fenc_sa8d[y][x];
97 /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
98 /* SATD and SA8D are used to measure block complexity. */
99 /* The difference between SATD and SA8D scores are both used to avoid bias from the DCT size. Using SATD */
100 /* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
102 /* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
103 /* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
104 /* This optimization can also be used in non-RD transform decision. */
106 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
108 DECLARE_ALIGNED_16(static uint8_t zero[16]);
110 uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
111 uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
112 if( p == 0 && h->mb.i_psy_rd )
114 /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
115 if( size <= PIXEL_8x8 )
117 uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
118 satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
119 + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
124 int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
125 satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
127 satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
129 return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
132 static inline int ssd_mb( x264_t *h )
134 return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
135 + ssd_plane(h, PIXEL_8x8, 1, 0, 0)
136 + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
139 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
141 int b_transform_bak = h->mb.b_transform_8x8;
145 x264_macroblock_encode( h );
149 if( IS_SKIP( h->mb.i_type ) )
151 i_bits = (1 * i_lambda2 + 128) >> 8;
153 else if( h->param.b_cabac )
155 x264_cabac_t cabac_tmp;
157 x264_macroblock_size_cabac( h, &cabac_tmp );
158 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
162 bs_t bs_tmp = h->out.bs;
163 bs_tmp.i_bits_encoded = 0;
164 x264_macroblock_size_cavlc( h, &bs_tmp );
165 i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
168 h->mb.b_transform_8x8 = b_transform_bak;
170 return i_ssd + i_bits;
173 /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
175 static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
177 uint64_t i_ssd, i_bits;
179 x264_macroblock_encode_p4x4( h, i4 );
180 if( i_pixel == PIXEL_8x4 )
181 x264_macroblock_encode_p4x4( h, i4+1 );
182 if( i_pixel == PIXEL_4x8 )
183 x264_macroblock_encode_p4x4( h, i4+2 );
185 i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
187 if( h->param.b_cabac )
189 x264_cabac_t cabac_tmp;
191 x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
192 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
196 i_bits = x264_subpartition_size_cavlc( h, i4, i_pixel );
199 return (i_ssd<<8) + i_bits;
202 uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
204 uint64_t i_ssd, i_bits;
207 if( i_pixel == PIXEL_16x16 )
209 int type_bak = h->mb.i_type;
210 int i_cost = x264_rd_cost_mb( h, i_lambda2 );
211 h->mb.i_type = type_bak;
215 if( i_pixel > PIXEL_8x8 )
216 return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );
218 h->mb.i_cbp_luma = 0;
220 x264_macroblock_encode_p8x8( h, i8 );
221 if( i_pixel == PIXEL_16x8 )
222 x264_macroblock_encode_p8x8( h, i8+1 );
223 if( i_pixel == PIXEL_8x16 )
224 x264_macroblock_encode_p8x8( h, i8+2 );
226 i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 )
227 + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
228 + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
230 if( h->param.b_cabac )
232 x264_cabac_t cabac_tmp;
234 x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
235 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
239 i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
242 return (i_ssd<<8) + i_bits;
245 static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
247 uint64_t i_ssd, i_bits;
248 h->mb.i_cbp_luma &= ~(1<<i8);
249 h->mb.b_transform_8x8 = 1;
251 x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
252 i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
254 if( h->param.b_cabac )
256 x264_cabac_t cabac_tmp;
258 x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
259 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
263 i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
266 return (i_ssd<<8) + i_bits;
269 static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
271 uint64_t i_ssd, i_bits;
273 x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
274 i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
276 if( h->param.b_cabac )
278 x264_cabac_t cabac_tmp;
280 x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
281 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
285 i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
288 return (i_ssd<<8) + i_bits;
291 static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
293 uint64_t i_ssd, i_bits;
296 x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
297 i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
298 ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
300 h->mb.i_chroma_pred_mode = i_mode;
302 if( h->param.b_cabac )
304 x264_cabac_t cabac_tmp;
306 x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
307 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
311 i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
314 return (i_ssd<<8) + i_bits;
316 /****************************************************************************
317 * Trellis RD quantization
318 ****************************************************************************/
320 #define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
321 #define CABAC_SIZE_BITS 8
322 #define SSD_WEIGHT_BITS 5
323 #define LAMBDA_BITS 4
325 /* precalculate the cost of coding various combinations of bits in a single context */
326 void x264_rdo_init( void )
328 int i_prefix, i_ctx, i;
329 for( i_prefix = 0; i_prefix < 15; i_prefix++ )
331 for( i_ctx = 0; i_ctx < 128; i_ctx++ )
336 for( i = 1; i < i_prefix; i++ )
337 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
338 if( i_prefix > 0 && i_prefix < 14 )
339 f8_bits += x264_cabac_size_decision2( &ctx, 0 );
340 f8_bits += 1 << CABAC_SIZE_BITS; //sign
342 cabac_size_unary[i_prefix][i_ctx] = f8_bits;
343 cabac_transition_unary[i_prefix][i_ctx] = ctx;
346 for( i_ctx = 0; i_ctx < 128; i_ctx++ )
351 for( i = 0; i < 5; i++ )
352 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
353 f8_bits += 1 << CABAC_SIZE_BITS; //sign
355 cabac_size_5ones[i_ctx] = f8_bits;
356 cabac_transition_5ones[i_ctx] = ctx;
360 // should the intra and inter lambdas be different?
361 // I'm just matching the behaviour of deadzone quant.
362 static const int lambda2_tab[2][52] = {
363 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
364 { 46, 58, 73, 92, 117, 147,
365 185, 233, 294, 370, 466, 587,
366 740, 932, 1174, 1480, 1864, 2349,
367 2959, 3728, 4697, 5918, 7457, 9395,
368 11837, 14914, 18790, 23674, 29828, 37581,
369 47349, 59656, 75163, 94699, 119313, 150326,
370 189399, 238627, 300652, 378798, 477255, 601304,
371 757596, 954511, 1202608, 1515192, 1909022, 2405217,
372 3030384, 3818045, 4810435, 6060769 },
373 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
374 { 27, 34, 43, 54, 68, 86,
375 108, 136, 172, 216, 273, 343,
376 433, 545, 687, 865, 1090, 1374,
377 1731, 2180, 2747, 3461, 4361, 5494,
378 6922, 8721, 10988, 13844, 17442, 21976,
379 27688, 34885, 43953, 55377, 69771, 87906,
380 110755, 139543, 175813, 221511, 279087, 351627,
381 443023, 558174, 703255, 886046, 1116348, 1406511,
382 1772093, 2232697, 2813022, 3544186 }
387 int level_idx; // index into level_tree[]
388 uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
392 // save cabac state between blocks?
393 // use trellis' RD score instead of x264_mb_decimate_score?
394 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
396 // change weights when using CQMs?
398 // possible optimizations:
399 // make scores fit in 32bit
400 // save quantized coefs during rd, to avoid a duplicate trellis in the final encode
401 // if trellissing all MBRD modes, finish SSD calculation so we can skip all of
402 // the normal dequant/idct/ssd/cabac
404 // the unquant_mf here is not the same as dequant_mf:
405 // in normal operation (dct->quant->dequant->idct) the dct and idct are not
406 // normalized. quant/dequant absorb those scaling factors.
407 // in this function, we just do (quant->unquant) and want the output to be
408 // comparable to the input. so unquant is the direct inverse of quant,
409 // and uses the dct scaling factors, not the idct ones.
411 static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct,
412 const uint16_t *quant_mf, const int *unquant_mf,
413 const int *coef_weight, const uint8_t *zigzag,
414 int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
416 int abs_coefs[64], signs[64];
417 trellis_node_t nodes[2][8];
418 trellis_node_t *nodes_cur = nodes[0];
419 trellis_node_t *nodes_prev = nodes[1];
420 trellis_node_t *bnode;
421 uint8_t cabac_state_sig[64];
422 uint8_t cabac_state_last[64];
423 const int b_interlaced = h->mb.b_interlaced;
424 const int f = 1 << 15; // no deadzone
428 // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
429 // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
430 // but it takes more time to remove dead states than you gain in reduced memory.
434 } level_tree[64*8*2];
435 int i_levels_used = 1;
438 for( i = i_coefs-1; i >= b_ac; i-- )
439 if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
444 memset( dct, 0, i_coefs * sizeof(*dct) );
450 for( ; i >= b_ac; i-- )
452 int coef = dct[zigzag[i]];
453 abs_coefs[i] = abs(coef);
454 signs[i] = coef < 0 ? -1 : 1;
458 for( i = 1; i < 8; i++ )
459 nodes_cur[i].score = TRELLIS_SCORE_MAX;
460 nodes_cur[0].score = 0;
461 nodes_cur[0].level_idx = 0;
462 level_tree[0].abs_level = 0;
463 level_tree[0].next = 0;
465 // coefs are processed in reverse order, because that's how the abs value is coded.
466 // last_coef and significant_coef flags are normally coded in forward order, but
467 // we have to reverse them to match the levels.
468 // in 4x4 blocks, last_coef and significant_coef use a separate context for each
469 // position, so the order doesn't matter, and we don't even have to update their contexts.
470 // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
471 // cabac isn't too sensitive.
475 const uint8_t *ctx_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
476 const uint8_t *ctx_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
477 for( i = 0; i < 63; i++ )
479 cabac_state_sig[i] = ctx_sig[ significant_coeff_flag_offset_8x8[b_interlaced][i] ];
480 cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
483 else if( !dc || i_ctxBlockCat != DCT_CHROMA_DC )
485 memcpy( cabac_state_sig, &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
486 memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
490 memcpy( cabac_state_sig, &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
491 memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
493 memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
495 for( i = i_last_nnz; i >= b_ac; i-- )
497 int i_coef = abs_coefs[i];
498 int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
500 int cost_sig[2], cost_last[2];
503 // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
506 // no need to calculate ssd of 0s: it's the same in all nodes.
507 // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
508 const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 )
509 * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
510 for( j = 1; j < 8; j++ )
512 if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
514 #define SET_LEVEL(n,l) \
515 level_tree[i_levels_used].abs_level = l; \
516 level_tree[i_levels_used].next = n.level_idx; \
517 n.level_idx = i_levels_used; \
520 SET_LEVEL( nodes_cur[j], 0 );
521 nodes_cur[j].score += cost_sig0;
527 XCHG( trellis_node_t*, nodes_cur, nodes_prev );
529 for( j = 0; j < 8; j++ )
530 nodes_cur[j].score = TRELLIS_SCORE_MAX;
534 cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 );
535 cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 1 );
536 cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 0 );
537 cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 1 );
541 cost_sig[0] = cost_sig[1] = 0;
542 cost_last[0] = cost_last[1] = 0;
545 // there are a few cases where increasing the coeff magnitude helps,
546 // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
547 // could also try q-2: that sometimes helps, but also sometimes decimates blocks
548 // that are better left coded, especially at QP > 40.
549 for( abs_level = q; abs_level >= q-1; abs_level-- )
551 int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
552 int d = i_coef - unquant_abs_level;
554 /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
555 if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
557 int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
558 int predicted_coef = orig_coef - i_coef * signs[i];
559 int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
560 int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
561 ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
564 /* FIXME: for i16x16 dc is this weight optimal? */
565 ssd = (int64_t)d*d * (dc?256:coef_weight[i]);
567 for( j = 0; j < 8; j++ )
570 if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
574 /* code the proposed level, and count how much entropy it would take */
575 if( abs_level || node_ctx )
577 unsigned f8_bits = cost_sig[ abs_level != 0 ];
580 const int i_prefix = X264_MIN( abs_level - 1, 14 );
581 f8_bits += cost_last[ node_ctx == 0 ];
582 f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
585 uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
586 f8_bits += cabac_size_unary[i_prefix][*ctx];
587 *ctx = cabac_transition_unary[i_prefix][*ctx];
588 if( abs_level >= 15 )
589 f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
590 node_ctx = coeff_abs_level_transition[1][node_ctx];
594 f8_bits += 1 << CABAC_SIZE_BITS;
595 node_ctx = coeff_abs_level_transition[0][node_ctx];
598 n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
603 /* save the node if it's better than any existing node with the same cabac ctx */
604 if( n.score < nodes_cur[node_ctx].score )
606 SET_LEVEL( n, abs_level );
607 nodes_cur[node_ctx] = n;
613 /* output levels from the best path through the trellis */
614 bnode = &nodes_cur[0];
615 for( j = 1; j < 8; j++ )
616 if( nodes_cur[j].score < bnode->score )
617 bnode = &nodes_cur[j];
619 j = bnode->level_idx;
621 for( i = b_ac; i < i_coefs; i++ )
623 dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
624 nz |= level_tree[j].abs_level;
625 j = level_tree[j].next;
630 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
632 int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
633 int i_qp, int i_ctxBlockCat, int b_intra )
635 return quant_trellis_cabac( h, (int16_t*)dct,
636 h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
637 NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
638 i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
641 int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
642 int i_qp, int i_ctxBlockCat, int b_intra, int idx )
644 int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
645 return quant_trellis_cabac( h, (int16_t*)dct,
646 h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
647 x264_dct4_weight2_zigzag[h->mb.b_interlaced],
648 x264_zigzag_scan4[h->mb.b_interlaced],
649 i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
652 int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
653 int i_qp, int b_intra, int idx )
655 return quant_trellis_cabac( h, (int16_t*)dct,
656 h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
657 x264_dct8_weight2_zigzag[h->mb.b_interlaced],
658 x264_zigzag_scan8[h->mb.b_interlaced],
659 DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 0, 64, idx );