1 /*****************************************************************************
2 * rdo.c: h264 encoder library (rate-distortion optimization)
3 *****************************************************************************
4 * Copyright (C) 2005 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21 *****************************************************************************/
23 /* duplicate all the writer functions, just calculating bit cost
24 * instead of writing the bitstream.
25 * TODO: use these for fast 1st pass too. */
/* Lookup tables filled in by x264_rdo_init(): for each UEG0 prefix length
 * (0..14) and each of the 128 CABAC context states, the fractional bit cost
 * of coding that prefix and the context state left behind afterwards. */
29 static int cabac_prefix_transition[15][128];
30 static int cabac_prefix_size[15][128];
32 /* CAVLC: produces exactly the same bit count as a normal encode */
33 /* this probably still leaves some unnecessary computations */
/* Redirect all bitstream-writing primitives to pure bit counters so the
 * normal syntax-writing routines can be reused as size estimators. */
34 #define bs_write1(s,v) ((s)->i_bits_encoded += 1)
35 #define bs_write(s,n,v) ((s)->i_bits_encoded += (n))
36 #define bs_write_ue(s,v) ((s)->i_bits_encoded += bs_size_ue(v))
37 #define bs_write_se(s,v) ((s)->i_bits_encoded += bs_size_se(v))
38 #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
39 #define x264_macroblock_write_cavlc x264_macroblock_size_cavlc
42 /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
43 * fractional bits, but only finite precision. */
/* f8_bits_encoded is in 1/256-bit units, so a bypass bin costs exactly 256.
 * ctx 276 is the end-of-slice terminal context. */
44 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
45 #define x264_cabac_encode_terminal(c) x264_cabac_size_decision(c,276,0)
46 #define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256)
47 #define x264_cabac_encode_flush(h,c)
48 #define x264_macroblock_write_cabac x264_macroblock_size_cabac
49 #define x264_cabac_mb_skip x264_cabac_mb_size_skip_unused
53 static int ssd_mb( x264_t *h )
55 return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
56 h->mb.pic.p_fdec[0], FDEC_STRIDE )
57 + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE,
58 h->mb.pic.p_fdec[1], FDEC_STRIDE )
59 + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE,
60 h->mb.pic.p_fdec[2], FDEC_STRIDE );
63 static int ssd_plane( x264_t *h, int size, int p, int x, int y )
65 return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
66 h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
/* RD cost of the whole macroblock with its current mode decisions:
 * SSD distortion plus lambda2-weighted bit cost of the syntax.
 * NOTE(review): this listing is missing several lines (braces, the
 * declarations of i_ssd/i_bits, and the i_ssd = ssd_mb(h) call) — the
 * logic below should be read against the full source. */
69 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
71 int b_transform_bak = h->mb.b_transform_8x8;
75 x264_macroblock_encode( h );
/* skip MBs cost ~1 bit of signalling */
79 if( IS_SKIP( h->mb.i_type ) )
81 i_bits = 1 * i_lambda2;
83 else if( h->param.b_cabac )
/* size on a copy of the cabac state so the real encoder state is untouched */
85 x264_cabac_t cabac_tmp = h->cabac;
86 cabac_tmp.f8_bits_encoded = 0;
87 x264_macroblock_size_cabac( h, &cabac_tmp );
/* f8_bits_encoded is in 1/256-bit units: rescale with rounding */
88 i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
92 bs_t bs_tmp = h->out.bs;
93 bs_tmp.i_bits_encoded = 0;
94 x264_macroblock_size_cavlc( h, &bs_tmp );
95 i_bits = bs_tmp.i_bits_encoded * i_lambda2;
/* x264_macroblock_encode may have changed the transform size; restore it */
98 h->mb.b_transform_8x8 = b_transform_bak;
100 return i_ssd + i_bits;
/* RD cost of one inter partition (8x8, 16x8, 8x16, or full 16x16).
 * i8 is the 8x8 block index of the partition's top-left; i_pixel selects
 * the partition shape. NOTE(review): braces and the i_ssd/i_bits
 * declarations are missing from this listing. */
103 int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
/* a 16x16 partition is just the whole-MB cost; preserve i_type around it */
107 if( i_pixel == PIXEL_16x16 )
109 int type_bak = h->mb.i_type;
110 int i_cost = x264_rd_cost_mb( h, i_lambda2 );
111 h->mb.i_type = type_bak;
/* encode the covered 8x8 blocks: one extra block below for 16x8
 * (i8+2) and one to the right for 8x16 (i8+1) —
 * NOTE(review): listing shows i8+1 for 16x8 and i8+2 for 8x16; in the
 * 8x8-raster numbering +1 is horizontal and +2 is vertical, so confirm
 * this pairing against the full source. */
115 x264_macroblock_encode_p8x8( h, i8 );
116 if( i_pixel == PIXEL_16x8 )
117 x264_macroblock_encode_p8x8( h, i8+1 );
118 if( i_pixel == PIXEL_8x16 )
119 x264_macroblock_encode_p8x8( h, i8+2 );
/* i_pixel+3 maps a luma partition size to the corresponding half-size
 * chroma partition; chroma offsets are half the luma offsets */
121 i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 )
122 + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
123 + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
125 if( h->param.b_cabac )
/* size on a throwaway copy of the cabac state; 1/256-bit units, rounded */
127 x264_cabac_t cabac_tmp = h->cabac;
128 cabac_tmp.f8_bits_encoded = 0;
129 x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
130 i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
134 i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
137 return i_ssd + i_bits;
/* RD cost of one intra 8x8 luma block coded with prediction mode i_mode.
 * NOTE(review): braces and the i_ssd/i_bits declarations are missing
 * from this listing. */
140 int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
144 x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
145 i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
147 if( h->param.b_cabac )
/* size on a throwaway copy of the cabac state; 1/256-bit units, rounded */
149 x264_cabac_t cabac_tmp = h->cabac;
150 cabac_tmp.f8_bits_encoded = 0;
151 x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
152 i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
156 i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
159 return i_ssd + i_bits;
/* RD cost of one intra 4x4 luma block coded with prediction mode i_mode.
 * block_idx_x/y translate the 4x4 scan index into block coordinates.
 * NOTE(review): braces and the i_ssd/i_bits declarations are missing
 * from this listing. */
162 int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
166 x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
167 i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
169 if( h->param.b_cabac )
/* size on a throwaway copy of the cabac state; 1/256-bit units, rounded */
171 x264_cabac_t cabac_tmp = h->cabac;
172 cabac_tmp.f8_bits_encoded = 0;
173 x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
174 i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
178 i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
181 return i_ssd + i_bits;
/* RD cost of the chroma planes under intra chroma prediction mode i_mode.
 * b_dct presumably controls whether the residual is re-transformed —
 * its use is not visible in this truncated listing; confirm in full source.
 * NOTE(review): braces and the i_ssd/i_bits declarations are missing. */
184 int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
189 x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
190 i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
191 ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
/* set the mode so the size functions count its signalling cost */
193 h->mb.i_chroma_pred_mode = i_mode;
195 if( h->param.b_cabac )
/* size on a throwaway copy of the cabac state; 1/256-bit units, rounded */
197 x264_cabac_t cabac_tmp = h->cabac;
198 cabac_tmp.f8_bits_encoded = 0;
199 x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
200 i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
204 i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
207 return i_ssd + i_bits;
209 /****************************************************************************
210 * Trellis RD quantization
211 ****************************************************************************/
/* Sentinel score for unreachable trellis nodes; large but leaves headroom
 * so that adding bit/ssd costs cannot overflow uint64_t. */
213 #define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
/* CABAC sizes are fixed-point with 8 fractional bits (1/256-bit units). */
214 #define CABAC_SIZE_BITS 8
/* Fractional-bit scaling for the per-coefficient SSD weights and lambda. */
215 #define SSD_WEIGHT_BITS 5
216 #define LAMBDA_BITS 4
218 /* precalculate the cost of coding abs_level_m1 */
/* Fills cabac_prefix_size/cabac_prefix_transition: for every prefix length
 * and every starting context state, run the unary prefix through the cabac
 * size model once and record the total fractional bits plus the final state.
 * NOTE(review): this listing is missing the local declarations, the
 * initialization of ctx/f8_bits inside the inner loop, and braces. */
219 void x264_rdo_init( )
223 for( i_prefix = 0; i_prefix < 15; i_prefix++ )
225 for( i_ctx = 0; i_ctx < 128; i_ctx++ )
/* the first bin uses a different context (handled elsewhere in the trellis);
 * here we cost the remaining '1' bins of the unary prefix */
231 for( i = 1; i < i_prefix; i++ )
232 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
/* terminating '0' bin, except when the prefix is empty or saturated at 14 */
233 if( i_prefix > 0 && i_prefix < 14 )
234 f8_bits += x264_cabac_size_decision2( &ctx, 0 );
235 f8_bits += 1 << CABAC_SIZE_BITS; //sign
237 cabac_prefix_size[i_prefix][i_ctx] = f8_bits;
238 cabac_prefix_transition[i_prefix][i_ctx] = ctx;
243 // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
244 // 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
245 /* map node ctx => cabac ctx for level=1 */
246 static const int coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
247 /* map node ctx => cabac ctx for level>1 */
248 static const int coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
/* state machine for the node ctx itself, per H.264 coeff_abs_level coding */
249 static const int coeff_abs_level_transition[2][8] = {
250 /* update node.ctx after coding a level=1 */
251 { 1, 2, 3, 3, 4, 5, 6, 7 },
252 /* update node.ctx after coding a level>1 */
253 { 4, 4, 4, 4, 5, 6, 7, 7 }
256 // should the intra and inter lambdas be different?
257 // I'm just matching the behaviour of deadzone quant.
/* lambda^2 per QP (0..51), prescaled by 2^(10-LAMBDA_BITS); [0]=inter, [1]=intra */
258 static const int lambda2_tab[2][52] = {
259 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
260 { 46, 58, 73, 92, 117, 147,
261 185, 233, 294, 370, 466, 587,
262 740, 932, 1174, 1480, 1864, 2349,
263 2959, 3728, 4697, 5918, 7457, 9395,
264 11837, 14914, 18790, 23674, 29828, 37581,
265 47349, 59656, 75163, 94699, 119313, 150326,
266 189399, 238627, 300652, 378798, 477255, 601304,
267 757596, 954511, 1202608, 1515192, 1909022, 2405217,
268 3030384, 3818045, 4810435, 6060769 },
269 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
270 { 27, 34, 43, 54, 68, 86,
271 108, 136, 172, 216, 273, 343,
272 433, 545, 687, 865, 1090, 1374,
273 1731, 2180, 2747, 3461, 4361, 5494,
274 6922, 8721, 10988, 13844, 17442, 21976,
275 27688, 34885, 43953, 55377, 69771, 87906,
276 110755, 139543, 175813, 221511, 279087, 351627,
277 443023, 558174, 703255, 886046, 1116348, 1406511,
278 1772093, 2232697, 2813022, 3544186 }
/* NOTE(review): these are trailing fields of a trellis node struct
 * (presumably trellis_node_t, with a score field) whose declaration head
 * was dropped from this listing — confirm against the full source. */
283 int level_idx; // index into level_tree[]
284 uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
288 // support chroma and i16x16 DC
289 // save cabac state between blocks?
290 // use trellis' RD score instead of x264_mb_decimate_score?
291 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
293 // change weights when using CQMs?
295 // possible optimizations:
296 // make scores fit in 32bit
297 // save quantized coefs during rd, to avoid a duplicate trellis in the final encode
298 // if trellissing all MBRD modes, finish SSD calculation so we can skip all of
299 // the normal dequant/idct/ssd/cabac
301 // the unquant_mf here is not the same as dequant_mf:
302 // in normal operation (dct->quant->dequant->idct) the dct and idct are not
303 // normalized. quant/dequant absorb those scaling factors.
304 // in this function, we just do (quant->unquant) and want the output to be
305 // comparable to the input. so unquant is the direct inverse of quant,
306 // and uses the dct scaling factors, not the idct ones.
/* Trellis (Viterbi) quantization of one transform block under the CABAC
 * rate model: for each coefficient in reverse zigzag order, try the two
 * candidate levels q and q-1 from every surviving cabac-context node,
 * score = SSD + lambda2 * bits, and keep the best node per context.
 * dct:       coefficients in raster order, overwritten with the result.
 * quant_mf / unquant_mf: forward/inverse scaling tables for this QP.
 * coef_weight: per-position SSD weights; zigzag: scan table.
 * b_ac:      1 to skip the DC coefficient (LUMA_AC); i_coefs: 16 or 64.
 * NOTE(review): this listing is missing many lines — braces, the loop
 * index declarations, the level_tree struct head, the all-zero early
 * return, and parts of several loop bodies. Comments below are limited
 * to what the visible lines establish. */
308 static void quant_trellis_cabac( x264_t *h, int16_t *dct,
309 const uint16_t *quant_mf, const int *unquant_mf,
310 const int *coef_weight, const uint8_t *zigzag,
311 int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs )
313 int abs_coefs[64], signs[64];
/* double-buffered node arrays: 8 cabac contexts per coefficient position */
314 trellis_node_t nodes[2][8];
315 trellis_node_t *nodes_cur = nodes[0];
316 trellis_node_t *nodes_prev = nodes[1];
317 trellis_node_t *bnode;
318 uint8_t cabac_state_sig[64];
319 uint8_t cabac_state_last[64];
320 const int b_interlaced = h->mb.b_interlaced;
321 const int f = 1 << 15; // no deadzone
325 // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
326 // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
327 // but it takes more time to remove dead states than you gain in reduced memory.
/* singly-linked backtracking tree of chosen levels; entry 0 is the
 * self-looping all-zeros chain */
331 } level_tree[64*8*2];
332 int i_levels_used = 1;
/* find the last nonzero coefficient (after quantization with no deadzone);
 * if none, the whole block quantizes to zero */
335 for( i = i_coefs-1; i >= b_ac; i-- )
336 if( (unsigned)(dct[zigzag[i]] * quant_mf[zigzag[i]] + f-1) >= 2*f )
341 memset( dct, 0, i_coefs * sizeof(*dct) );
/* split coefficients into magnitude and sign; sign is coded in bypass
 * so it doesn't participate in the trellis */
347 for( ; i >= b_ac; i-- )
349 int coef = dct[zigzag[i]];
350 abs_coefs[i] = abs(coef);
351 signs[i] = coef < 0 ? -1 : 1;
/* init trellis: only node 0 (nothing coded yet) is reachable */
355 for( i = 1; i < 8; i++ )
356 nodes_cur[i].score = TRELLIS_SCORE_MAX;
357 nodes_cur[0].score = 0;
358 nodes_cur[0].level_idx = 0;
359 level_tree[0].abs_level = 0;
360 level_tree[0].next = 0;
362 // coefs are processed in reverse order, because that's how the abs value is coded.
363 // last_coef and significant_coef flags are normally coded in forward order, but
364 // we have to reverse them to match the levels.
365 // in 4x4 blocks, last_coef and significant_coef use a separate context for each
366 // position, so the order doesn't matter, and we don't even have to update their contexts.
367 // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
368 // cabac isn't too sensitive.
/* snapshot the sig/last contexts; 8x8 remaps shared contexts per position */
372 const uint8_t *ctx_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
373 const uint8_t *ctx_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
374 for( i = 0; i < 63; i++ )
376 cabac_state_sig[i] = ctx_sig[ significant_coeff_flag_offset_8x8[b_interlaced][i] ];
377 cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
382 memcpy( cabac_state_sig, &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
383 memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
385 memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
/* main trellis loop, from the last nonzero coefficient down to b_ac */
387 for( i = i_last_nnz; i >= b_ac; i-- )
389 int i_coef = abs_coefs[i];
/* quantize with rounding offset f (no deadzone) */
390 int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> 16;
392 int cost_sig[2], cost_last[2];
395 // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
398 // no need to calculate ssd of 0s: it's the same in all nodes.
399 // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
400 const uint32_t cost_sig0 = x264_cabac_size_decision_noup( &cabac_state_sig[i], 0 )
401 * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
402 for( j = 1; j < 8; j++ )
404 if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
/* append level l to node n's backtracking chain */
406 #define SET_LEVEL(n,l) \
407 level_tree[i_levels_used].abs_level = l; \
408 level_tree[i_levels_used].next = n.level_idx; \
409 n.level_idx = i_levels_used; \
412 SET_LEVEL( nodes_cur[j], 0 );
413 nodes_cur[j].score += cost_sig0;
/* advance the trellis one coefficient: prev <- cur, clear cur */
419 XCHG( trellis_node_t*, nodes_cur, nodes_prev );
421 for( j = 0; j < 8; j++ )
422 nodes_cur[j].score = TRELLIS_SCORE_MAX;
/* sig/last costs for this position (zero when flags aren't coded here) */
426 cost_sig[0] = x264_cabac_size_decision_noup( &cabac_state_sig[i], 0 );
427 cost_sig[1] = x264_cabac_size_decision_noup( &cabac_state_sig[i], 1 );
428 cost_last[0] = x264_cabac_size_decision_noup( &cabac_state_last[i], 0 );
429 cost_last[1] = x264_cabac_size_decision_noup( &cabac_state_last[i], 1 );
433 cost_sig[0] = cost_sig[1] = 0;
434 cost_last[0] = cost_last[1] = 0;
437 // there are a few cases where increasing the coeff magnitude helps,
438 // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
439 // could also try q-2: that sometimes helps, but also sometimes decimates blocks
440 // that are better left coded, especially at QP > 40.
441 for( abs_level = q; abs_level >= q-1; abs_level-- )
/* distortion of coding this coefficient as abs_level */
443 int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
444 uint64_t ssd = (int64_t)d*d * coef_weight[i];
446 for( j = 0; j < 8; j++ )
449 if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
453 /* code the proposed level, and count how much entropy it would take */
454 if( abs_level || node_ctx )
456 unsigned f8_bits = cost_sig[ abs_level != 0 ];
/* UEG0 binarization: prefix capped at 14, suffix is exp-golomb */
459 const int i_prefix = X264_MIN( abs_level - 1, 14 );
460 f8_bits += cost_last[ node_ctx == 0 ];
461 f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
464 uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
465 f8_bits += cabac_prefix_size[i_prefix][*ctx];
466 *ctx = cabac_prefix_transition[i_prefix][*ctx];
467 if( abs_level >= 15 )
468 f8_bits += bs_size_ue( abs_level - 15 ) << CABAC_SIZE_BITS;
469 node_ctx = coeff_abs_level_transition[1][node_ctx];
/* level == 1: just the sign bit beyond sig/last/level1 */
473 f8_bits += 1 << CABAC_SIZE_BITS;
474 node_ctx = coeff_abs_level_transition[0][node_ctx];
477 n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
482 /* save the node if it's better than any existing node with the same cabac ctx */
483 if( n.score < nodes_cur[node_ctx].score )
485 SET_LEVEL( n, abs_level );
486 nodes_cur[node_ctx] = n;
492 /* output levels from the best path through the trellis */
493 bnode = &nodes_cur[0];
494 for( j = 1; j < 8; j++ )
495 if( nodes_cur[j].score < bnode->score )
496 bnode = &nodes_cur[j];
/* walk the backtracking chain, restoring signs as we go */
498 j = bnode->level_idx;
499 for( i = b_ac; i < i_coefs; i++ )
501 dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
502 j = level_tree[j].next;
507 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
508 int i_qp, int i_ctxBlockCat, int b_intra )
510 int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);
511 quant_trellis_cabac( h, (int16_t*)dct,
512 h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
513 x264_dct4_weight2_zigzag[h->mb.b_interlaced],
514 x264_zigzag_scan4[h->mb.b_interlaced],
515 i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 );
/* Trellis-quantize an 8x8 luma block (always includes DC, 64 coefs).
 * NOTE(review): the opening and closing braces fall outside this listing. */
519 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
520 int i_qp, int b_intra )
522 quant_trellis_cabac( h, (int16_t*)dct,
523 h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
524 x264_dct8_weight2_zigzag[h->mb.b_interlaced],
525 x264_zigzag_scan8[h->mb.b_interlaced],
526 DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 );