1 /*****************************************************************************
2 * rdo.c: h264 encoder library (rate-distortion optimization)
3 *****************************************************************************
4 * Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *****************************************************************************/
21 /* duplicate all the writer functions, just calculating bit cost
22 * instead of writing the bitstream.
23 * TODO: use these for fast 1st pass too. */
27 /* Transition and size tables for abs<9 MVD and residual coding */
28 /* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
/* Indexed as [i_prefix][cabac context state]; filled in by x264_rdo_init(). */
29 static uint8_t cabac_transition_unary[15][128];
30 static uint16_t cabac_size_unary[15][128];
31 /* Transition and size tables for abs>9 MVD */
32 /* Consist of 5 1s and a bypass sign bit */
33 static uint8_t cabac_transition_5ones[128];
34 static uint16_t cabac_size_5ones[128];
36 /* CAVLC: produces exactly the same bit count as a normal encode */
37 /* this probably still leaves some unnecessary computations */
/* Replace the bitstream writers with pure counters: only the number of bits
 * is accumulated in i_bits_encoded; no bitstream bytes are produced. */
38 #define bs_write1(s,v) ((s)->i_bits_encoded += 1)
39 #define bs_write(s,n,v) ((s)->i_bits_encoded += (n))
40 #define bs_write_ue(s,v) ((s)->i_bits_encoded += bs_size_ue(v))
41 #define bs_write_se(s,v) ((s)->i_bits_encoded += bs_size_se(v))
42 #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
43 #define x264_macroblock_write_cavlc static x264_macroblock_size_cavlc
46 /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
47 * fractional bits, but only finite precision. */
/* f8_bits_encoded counts in 1/256-bit units (8 fractional bits); a bypass
 * bin therefore costs exactly 256. */
48 #undef x264_cabac_encode_decision
49 #undef x264_cabac_encode_decision_noup
50 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
51 #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
52 #define x264_cabac_encode_terminal(c) x264_cabac_size_decision_noup(c,276,0)
53 #define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256)
54 #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
55 #define x264_cabac_encode_flush(h,c)
56 #define x264_macroblock_write_cabac static x264_macroblock_size_cabac
/* Copy the tail of the cabac state (everything from f8_bits_encoded onward)
 * into a temporary, so size estimation does not clobber h->cabac. */
59 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
60 sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
63 /* Sum the cached SATDs to avoid repeating them. */
/* Sums the cached per-4x4 fenc SATD scores covering the rectangle of the
 * given partition size at pixel offset (x,y); PIXEL_16x16 short-circuits to
 * the precomputed whole-MB sum.
 * NOTE(review): this listing is missing lines 65-68 and 76-78 of the original
 * (the declarations of satd/min_x/min_y, braces and the final return). */
64 static inline int sum_satd( x264_t *h, int pixel, int x, int y )
69 int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
70 int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
71 if( pixel == PIXEL_16x16 )
72 return h->mb.pic.fenc_satd_sum;
73 for( y = min_y; y < max_y; y++ )
74 for( x = min_x; x < max_x; x++ )
75 satd += h->mb.pic.fenc_satd[y][x];
/* Same as sum_satd(), but over the cached per-8x8 fenc SA8D scores.
 * NOTE(review): declarations of sa8d/min_x/min_y, braces and the final
 * return are missing from this listing (gaps in the embedded numbering). */
79 static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
84 int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
85 int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
86 if( pixel == PIXEL_16x16 )
87 return h->mb.pic.fenc_sa8d_sum;
88 for( y = min_y; y < max_y; y++ )
89 for( x = min_x; x < max_x; x++ )
90 sa8d += h->mb.pic.fenc_sa8d[y][x];
94 /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
95 /* SATD and SA8D are used to measure block complexity. */
96 /* The difference between SATD and SA8D scores is used to avoid bias from the DCT size. Using SATD */
97 /* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
99 /* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
100 /* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
101 /* This optimization can also be used in non-RD transform decision. */
/* SSD of one plane's reconstruction (fdec) vs the source (fenc), plus — for
 * luma when psy-rd is enabled — a penalty proportional to the complexity
 * difference between source and reconstruction (see the comment block above). */
103 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
105 DECLARE_ALIGNED_16(static uint8_t zero[16]);
107 uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
108 uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
109 if( p == 0 && h->mb.i_psy_rd )
111 /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
112 if( size <= PIXEL_8x8 )
/* hadamard_ac packs two scores in one uint64_t: the low 32 bits are compared
 * against the cached SATD sum, the high 32 against the cached SA8D sum. */
114 uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
115 satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
116 + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
/* small partitions: SATD only, with the DC term (approximated via SAD/2)
 * subtracted out */
121 int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
122 satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
/* scale the complexity difference by psy-rd strength and lambda; +128 rounds
 * the >>8 */
124 satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
126 return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
/* Whole-macroblock distortion: 16x16 luma plus both 8x8 chroma planes. */
129 static inline int ssd_mb( x264_t *h )
131 return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
132 + ssd_plane(h, PIXEL_8x8, 1, 0, 0)
133 + ssd_plane(h, PIXEL_8x8, 2, 0, 0);
/* Full RD cost of the current macroblock: encode it, then return
 * SSD + lambda-weighted bit cost, with bits measured by the size-only
 * writer macros above instead of a real bitstream write.
 * NOTE(review): several lines are missing from this listing (i_ssd/i_bits
 * declarations, the ssd_mb() call, COPY_CABAC, braces) — see the gaps in
 * the embedded numbering. */
136 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
138 int b_transform_bak = h->mb.b_transform_8x8;
142 x264_macroblock_encode( h );
146 if( IS_SKIP( h->mb.i_type ) )
/* a skip MB costs exactly 1 bit (the skip flag) */
148 i_bits = (1 * i_lambda2 + 128) >> 8;
150 else if( h->param.b_cabac )
152 x264_cabac_t cabac_tmp;
154 x264_macroblock_size_cabac( h, &cabac_tmp );
/* f8_bits_encoded carries 8 fractional bits and i_lambda2 8 more: >>16 total */
155 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
159 bs_t bs_tmp = h->out.bs;
160 bs_tmp.i_bits_encoded = 0;
161 x264_macroblock_size_cavlc( h, &bs_tmp );
162 i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
/* encoding may have toggled the 8x8 transform flag; restore it */
165 h->mb.b_transform_8x8 = b_transform_bak;
167 return i_ssd + i_bits;
170 /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
/* RD cost of one sub-8x8 partition (4x4, 8x4 or 4x8): encode it, then return
 * (SSD << 8) + lambda-weighted bit cost — 8 extra fractional bits, per the
 * precision comment above this function. i4 is the 4x4 block index; 8x4/4x8
 * partitions also encode their second constituent 4x4 block. Chroma is not
 * included at this level.
 * NOTE(review): braces and the COPY_CABAC line are missing from this listing
 * (see gaps in the embedded numbering). */
172 static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
174 uint64_t i_ssd, i_bits;
176 x264_macroblock_encode_p4x4( h, i4 );
177 if( i_pixel == PIXEL_8x4 )
178 x264_macroblock_encode_p4x4( h, i4+1 );
179 if( i_pixel == PIXEL_4x8 )
180 x264_macroblock_encode_p4x4( h, i4+2 );
182 i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
184 if( h->param.b_cabac )
186 x264_cabac_t cabac_tmp;
188 x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
189 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
/* FIX: scale the CAVLC bit count by i_lambda2, matching the CABAC branch
 * above and the CAVLC branches of x264_rd_cost_part/_i8x8/_i4x4 below;
 * without the scale, the return value mixes lambda-scaled SSD with raw bits. */
193 i_bits = x264_subpartition_size_cavlc( h, i4, i_pixel ) * i_lambda2;
196 return (i_ssd<<8) + i_bits;
/* RD cost of one inter partition. 16x16 delegates to the whole-MB cost;
 * sub-8x8 sizes delegate to x264_rd_cost_subpart; otherwise the 8x8 (and,
 * for 16x8/8x16, the second 8x8) is encoded and costed directly, including
 * the co-located 4x4 chroma (i_pixel+3 maps the luma size to its chroma size).
 * Returns (SSD << 8) + lambda-weighted bits.
 * NOTE(review): braces, the `return i_cost;` of the 16x16 path, the i8
 * declaration and COPY_CABAC are missing from this listing. */
199 uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
201 uint64_t i_ssd, i_bits;
204 if( i_pixel == PIXEL_16x16 )
206 int type_bak = h->mb.i_type;
207 int i_cost = x264_rd_cost_mb( h, i_lambda2 );
/* x264_rd_cost_mb may change i_type; restore it for the caller */
208 h->mb.i_type = type_bak;
212 if( i_pixel > PIXEL_8x8 )
213 return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );
215 x264_macroblock_encode_p8x8( h, i8 );
216 if( i_pixel == PIXEL_16x8 )
217 x264_macroblock_encode_p8x8( h, i8+1 );
218 if( i_pixel == PIXEL_8x16 )
219 x264_macroblock_encode_p8x8( h, i8+2 );
221 i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 )
222 + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
223 + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
225 if( h->param.b_cabac )
227 x264_cabac_t cabac_tmp;
229 x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
230 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
234 i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
237 return (i_ssd<<8) + i_bits;
/* RD cost of one intra 8x8 luma block coded with mode i_mode.
 * Returns (SSD << 8) + lambda-weighted bits (luma only; no chroma here).
 * NOTE(review): braces and COPY_CABAC are missing from this listing. */
240 static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
242 uint64_t i_ssd, i_bits;
244 x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
245 i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
247 if( h->param.b_cabac )
249 x264_cabac_t cabac_tmp;
251 x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
252 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
256 i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
259 return (i_ssd<<8) + i_bits;
/* RD cost of one intra 4x4 luma block coded with mode i_mode.
 * Returns (SSD << 8) + lambda-weighted bits (luma only; no chroma here).
 * NOTE(review): braces and COPY_CABAC are missing from this listing. */
262 static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
264 uint64_t i_ssd, i_bits;
266 x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
267 i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
269 if( h->param.b_cabac )
271 x264_cabac_t cabac_tmp;
273 x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
274 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
278 i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
281 return (i_ssd<<8) + i_bits;
/* RD cost of the chroma planes under intra chroma prediction mode i_mode.
 * Returns (SSD << 8) + lambda-weighted bits for both 8x8 chroma planes.
 * NOTE(review): b_dct appears unused in the visible lines; the embedded
 * numbering skips line 288, presumably an `if( b_dct )` guarding the
 * x264_mb_encode_8x8_chroma call — verify against the full source. */
284 static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
286 uint64_t i_ssd, i_bits;
289 x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
290 i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
291 ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
/* the size functions read the pred mode from h->mb, so set it first */
293 h->mb.i_chroma_pred_mode = i_mode;
295 if( h->param.b_cabac )
297 x264_cabac_t cabac_tmp;
299 x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
300 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
304 i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
307 return (i_ssd<<8) + i_bits;
309 /****************************************************************************
310 * Trellis RD quantization
311 ****************************************************************************/
/* "infinite" path cost marking an unreachable trellis node */
313 #define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
/* cabac size estimates carry this many fractional bits (1/256-bit units) */
314 #define CABAC_SIZE_BITS 8
315 #define SSD_WEIGHT_BITS 5
316 #define LAMBDA_BITS 4
318 /* precalculate the cost of coding various combinations of bits in a single context */
/* Fills cabac_size_unary/cabac_transition_unary (unary prefixes of length
 * i_prefix, for every possible starting context state) and the 5-ones
 * tables used for large MVDs. Sizes are in 1/256-bit units and each entry
 * includes one bypass sign bit.
 * NOTE(review): the per-iteration initialization of the local `ctx` and
 * `f8_bits` (original lines 325-328 and 340-343) and the braces are
 * missing from this listing. */
319 void x264_rdo_init( void )
321 int i_prefix, i_ctx, i;
322 for( i_prefix = 0; i_prefix < 15; i_prefix++ )
324 for( i_ctx = 0; i_ctx < 128; i_ctx++ )
329 for( i = 1; i < i_prefix; i++ )
330 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
/* a terminating zero bin, except for the escape prefix (14) */
331 if( i_prefix > 0 && i_prefix < 14 )
332 f8_bits += x264_cabac_size_decision2( &ctx, 0 );
333 f8_bits += 1 << CABAC_SIZE_BITS; //sign
335 cabac_size_unary[i_prefix][i_ctx] = f8_bits;
336 cabac_transition_unary[i_prefix][i_ctx] = ctx;
339 for( i_ctx = 0; i_ctx < 128; i_ctx++ )
344 for( i = 0; i < 5; i++ )
345 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
346 f8_bits += 1 << CABAC_SIZE_BITS; //sign
348 cabac_size_5ones[i_ctx] = f8_bits;
349 cabac_transition_5ones[i_ctx] = ctx;
353 // should the intra and inter lambdas be different?
354 // I'm just matching the behaviour of deadzone quant.
/* lambda^2 per QP (0-51), indexed [b_intra][qp]; used as the rate weight in
 * quant_trellis_cabac. The 2**(10 - LAMBDA_BITS) factor in the formulas
 * below bakes in the fixed-point scaling. */
355 static const int lambda2_tab[2][52] = {
356 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
357 { 46, 58, 73, 92, 117, 147,
358 185, 233, 294, 370, 466, 587,
359 740, 932, 1174, 1480, 1864, 2349,
360 2959, 3728, 4697, 5918, 7457, 9395,
361 11837, 14914, 18790, 23674, 29828, 37581,
362 47349, 59656, 75163, 94699, 119313, 150326,
363 189399, 238627, 300652, 378798, 477255, 601304,
364 757596, 954511, 1202608, 1515192, 1909022, 2405217,
365 3030384, 3818045, 4810435, 6060769 },
366 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
367 { 27, 34, 43, 54, 68, 86,
368 108, 136, 172, 216, 273, 343,
369 433, 545, 687, 865, 1090, 1374,
370 1731, 2180, 2747, 3461, 4361, 5494,
371 6922, 8721, 10988, 13844, 17442, 21976,
372 27688, 34885, 43953, 55377, 69771, 87906,
373 110755, 139543, 175813, 221511, 279087, 351627,
374 443023, 558174, 703255, 886046, 1116348, 1406511,
375 1772093, 2232697, 2813022, 3544186 }
/* NOTE(review): these are interior fields of the trellis node struct used by
 * quant_trellis_cabac; the struct header and its score field (original lines
 * 377-379) are missing from this listing — verify against the full source. */
380 int level_idx; // index into level_tree[]
381 uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
385 // save cabac state between blocks?
386 // use trellis' RD score instead of x264_mb_decimate_score?
387 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
389 // change weights when using CQMs?
391 // possible optimizations:
392 // make scores fit in 32bit
393 // save quantized coefs during rd, to avoid a duplicate trellis in the final encode
394 // if trellissing all MBRD modes, finish SSD calculation so we can skip all of
395 // the normal dequant/idct/ssd/cabac
397 // the unquant_mf here is not the same as dequant_mf:
398 // in normal operation (dct->quant->dequant->idct) the dct and idct are not
399 // normalized. quant/dequant absorb those scaling factors.
400 // in this function, we just do (quant->unquant) and want the output to be
401 // comparable to the input. so unquant is the direct inverse of quant,
402 // and uses the dct scaling factors, not the idct ones.
/* Trellis (Viterbi) quantization of one block of DCT coefficients under
 * CABAC. Coefficients are walked in reverse scan order; for each one, two
 * candidate levels (the rounded quantization q and q-1) are tried against
 * 8 trellis states — one per cabac level-context — and each state keeps the
 * cheapest (SSD + lambda*bits) path leading to it. The winning path's levels
 * are finally written back into dct[].
 * dc selects DC-block handling (halved quant_mf / doubled unquant_mf),
 * b_ac skips coefficient 0 for AC-only blocks, i_coefs is 4/16/64, and idx
 * identifies the block for the psy-trellis source-coefficient lookup.
 * NOTE(review): many lines are missing from this listing (braces; the
 * declarations of i, j, i_last_nnz, abs_level, ssd and the node copy `n`;
 * the level_tree struct fields; parts of the node-propagation code) — see
 * the gaps in the embedded numbering. Comments below only describe what the
 * visible lines do. */
404 static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
405 const uint16_t *quant_mf, const int *unquant_mf,
406 const int *coef_weight, const uint8_t *zigzag,
407 int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
409 int abs_coefs[64], signs[64];
410 trellis_node_t nodes[2][8];
411 trellis_node_t *nodes_cur = nodes[0];
412 trellis_node_t *nodes_prev = nodes[1];
413 trellis_node_t *bnode;
414 uint8_t cabac_state_sig[64];
415 uint8_t cabac_state_last[64];
416 const int b_interlaced = h->mb.b_interlaced;
417 const int f = 1 << 15; // no deadzone
421 // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
422 // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
423 // but it takes more time to remove dead states than you gain in reduced memory.
427 } level_tree[64*8*2];
428 int i_levels_used = 1;
/* find the last coefficient that quantizes to nonzero; the test checks
 * whether |dct*mf| reaches the rounding threshold f */
431 for( i = i_coefs-1; i >= b_ac; i-- )
432 if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
/* all coefficients quantize to zero: clear the block and return early */
437 memset( dct, 0, i_coefs * sizeof(*dct) );
/* split remaining coefficients into magnitude and sign */
443 for( ; i >= b_ac; i-- )
445 int coef = dct[zigzag[i]];
446 abs_coefs[i] = abs(coef);
447 signs[i] = coef < 0 ? -1 : 1;
/* start with only node 0 (all-zero history) reachable */
451 for( i = 1; i < 8; i++ )
452 nodes_cur[i].score = TRELLIS_SCORE_MAX;
453 nodes_cur[0].score = 0;
454 nodes_cur[0].level_idx = 0;
455 level_tree[0].abs_level = 0;
456 level_tree[0].next = 0;
458 // coefs are processed in reverse order, because that's how the abs value is coded.
459 // last_coef and significant_coef flags are normally coded in forward order, but
460 // we have to reverse them to match the levels.
461 // in 4x4 blocks, last_coef and significant_coef use a separate context for each
462 // position, so the order doesn't matter, and we don't even have to update their contexts.
463 // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
464 // cabac isn't too sensitive.
/* 8x8 path: expand the shared per-position sig/last contexts into flat
 * per-coefficient arrays (63 entries: the last position needs no flags) */
468 const uint8_t *ctx_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
469 const uint8_t *ctx_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
470 for( i = 0; i < 63; i++ )
472 cabac_state_sig[i] = ctx_sig[ significant_coeff_flag_offset_8x8[b_interlaced][i] ];
473 cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
/* 4x4 / 16-coef blocks: one context per position, copied directly */
476 else if( !dc || i_ctxBlockCat != DCT_CHROMA_DC )
478 memcpy( cabac_state_sig, &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
479 memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
/* 2x2 chroma DC: only 3 positions carry sig/last flags */
483 memcpy( cabac_state_sig, &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
484 memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
486 memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
/* main trellis loop: reverse scan order, from the last nonzero coefficient */
488 for( i = i_last_nnz; i >= b_ac; i-- )
490 int i_coef = abs_coefs[i];
/* rounded quantization; DC blocks use quant_mf[0]>>1 (paired with the
 * unquant_mf[0]<<1 below) */
491 int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
493 int cost_sig[2], cost_last[2];
496 // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
499 // no need to calculate ssd of 0s: it's the same in all nodes.
500 // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
/* cost of coding significant_coeff_flag=0, lambda-weighted */
501 const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 )
502 * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
503 for( j = 1; j < 8; j++ )
505 if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
/* append level l to node n's decoded-level list (singly linked, newest first) */
507 #define SET_LEVEL(n,l) \
508 level_tree[i_levels_used].abs_level = l; \
509 level_tree[i_levels_used].next = n.level_idx; \
510 n.level_idx = i_levels_used; \
513 SET_LEVEL( nodes_cur[j], 0 );
514 nodes_cur[j].score += cost_sig0;
/* advance to the next coefficient: previous column becomes the source */
520 XCHG( trellis_node_t*, nodes_cur, nodes_prev );
522 for( j = 0; j < 8; j++ )
523 nodes_cur[j].score = TRELLIS_SCORE_MAX;
/* sig/last flag costs for this position (noup: contexts are not updated,
 * per the ordering comment above) */
527 cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 );
528 cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 1 );
529 cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 0 );
530 cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 1 );
/* positions with no sig/last flags (e.g. the last scan position) cost nothing */
534 cost_sig[0] = cost_sig[1] = 0;
535 cost_last[0] = cost_last[1] = 0;
538 // there are a few cases where increasing the coeff magnitude helps,
539 // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
540 // could also try q-2: that sometimes helps, but also sometimes decimates blocks
541 // that are better left coded, especially at QP > 40.
542 for( abs_level = q; abs_level >= q-1; abs_level-- )
/* reconstruct the candidate level to compute its distortion */
544 int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
545 int d = i_coef - unquant_abs_level;
547 /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
548 if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
550 int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
551 int predicted_coef = orig_coef - i_coef * signs[i];
552 int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
553 int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
554 ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
557 /* FIXME: for i16x16 dc is this weight optimal? */
558 ssd = (int64_t)d*d * (dc?256:coef_weight[i]);
/* propagate every live previous node through this candidate level */
560 for( j = 0; j < 8; j++ )
563 if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
567 /* code the proposed level, and count how much entropy it would take */
568 if( abs_level || node_ctx )
570 unsigned f8_bits = cost_sig[ abs_level != 0 ];
/* nonzero level: last flag, the >1 decision, then the unary suffix */
573 const int i_prefix = X264_MIN( abs_level - 1, 14 );
574 f8_bits += cost_last[ node_ctx == 0 ];
575 f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
578 uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
/* precomputed unary-suffix size and resulting context state (x264_rdo_init) */
579 f8_bits += cabac_size_unary[i_prefix][*ctx];
580 *ctx = cabac_transition_unary[i_prefix][*ctx];
/* levels >= 15 escape to an Exp-Golomb bypass suffix */
581 if( abs_level >= 15 )
582 f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
583 node_ctx = coeff_abs_level_transition[1][node_ctx];
/* zero level on a nonzero-history node: 1 bit, distinct context transition */
587 f8_bits += 1 << CABAC_SIZE_BITS;
588 node_ctx = coeff_abs_level_transition[0][node_ctx];
591 n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
596 /* save the node if it's better than any existing node with the same cabac ctx */
597 if( n.score < nodes_cur[node_ctx].score )
599 SET_LEVEL( n, abs_level );
600 nodes_cur[node_ctx] = n;
606 /* output levels from the best path through the trellis */
607 bnode = &nodes_cur[0];
608 for( j = 1; j < 8; j++ )
609 if( nodes_cur[j].score < bnode->score )
610 bnode = &nodes_cur[j];
/* walk the winning level list (stored last-coef-first) back into dct[] */
612 j = bnode->level_idx;
613 for( i = b_ac; i < i_coefs; i++ )
615 dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
616 j = level_tree[j].next;
/* identity scan for the 2x2 chroma DC block */
620 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
/* Trellis-quantize a DC block (2x2 chroma DC or 4x4 luma DC): dc=1,
 * b_ac=0, no per-coefficient SSD weights (coef_weight=NULL). */
622 void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
623 int i_qp, int i_ctxBlockCat, int b_intra )
625 quant_trellis_cabac( h, (int16_t*)dct,
626 h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
627 NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
628 i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
/* Trellis-quantize a 4x4 block; AC-only block categories skip coefficient 0. */
631 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
632 int i_qp, int i_ctxBlockCat, int b_intra, int idx )
634 int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
635 quant_trellis_cabac( h, (int16_t*)dct,
636 h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
637 x264_dct4_weight2_zigzag[h->mb.b_interlaced],
638 x264_zigzag_scan4[h->mb.b_interlaced],
639 i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
/* Trellis-quantize an 8x8 luma block (always DCT_LUMA_8x8, 64 coefficients). */
642 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
643 int i_qp, int b_intra, int idx )
645 quant_trellis_cabac( h, (int16_t*)dct,
646 h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
647 x264_dct8_weight2_zigzag[h->mb.b_interlaced],
648 x264_zigzag_scan8[h->mb.b_interlaced],
649 DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 0, 64, idx );