1 /*****************************************************************************
2 * rdo.c: h264 encoder library (rate-distortion optimization)
3 *****************************************************************************
4 * Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *****************************************************************************/
21 /* duplicate all the writer functions, just calculating bit cost
22 * instead of writing the bitstream.
23 * TODO: use these for fast 1st pass too. */
27 static uint8_t cabac_prefix_transition[15][128];
28 static uint16_t cabac_prefix_size[15][128];
30 /* CAVLC: produces exactly the same bit count as a normal encode */
31 /* this probably still leaves some unnecessary computations */
32 #define bs_write1(s,v) ((s)->i_bits_encoded += 1)
33 #define bs_write(s,n,v) ((s)->i_bits_encoded += (n))
34 #define bs_write_ue(s,v) ((s)->i_bits_encoded += bs_size_ue(v))
35 #define bs_write_se(s,v) ((s)->i_bits_encoded += bs_size_se(v))
36 #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
37 #define x264_macroblock_write_cavlc x264_macroblock_size_cavlc
40 /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
41 * fractional bits, but only finite precision. */
42 #undef x264_cabac_encode_decision
43 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
44 #define x264_cabac_encode_terminal(c) x264_cabac_size_decision(c,276,0)
45 #define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256)
46 #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
47 #define x264_cabac_encode_flush(h,c)
48 #define x264_macroblock_write_cabac x264_macroblock_size_cabac
51 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
52 sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
54 static int ssd_mb( x264_t *h )
56 return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
57 h->mb.pic.p_fdec[0], FDEC_STRIDE )
58 + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE,
59 h->mb.pic.p_fdec[1], FDEC_STRIDE )
60 + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE,
61 h->mb.pic.p_fdec[2], FDEC_STRIDE );
64 static int ssd_plane( x264_t *h, int size, int p, int x, int y )
66 return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
67 h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
70 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
72 int b_transform_bak = h->mb.b_transform_8x8;
76 x264_macroblock_encode( h );
80 if( IS_SKIP( h->mb.i_type ) )
82 i_bits = (1 * i_lambda2 + 128) >> 8;
84 else if( h->param.b_cabac )
86 x264_cabac_t cabac_tmp;
88 x264_macroblock_size_cabac( h, &cabac_tmp );
89 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
93 bs_t bs_tmp = h->out.bs;
94 bs_tmp.i_bits_encoded = 0;
95 x264_macroblock_size_cavlc( h, &bs_tmp );
96 i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
99 h->mb.b_transform_8x8 = b_transform_bak;
101 return i_ssd + i_bits;
104 int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
108 if( i_pixel == PIXEL_16x16 )
110 int type_bak = h->mb.i_type;
111 int i_cost = x264_rd_cost_mb( h, i_lambda2 );
112 h->mb.i_type = type_bak;
116 x264_macroblock_encode_p8x8( h, i8 );
117 if( i_pixel == PIXEL_16x8 )
118 x264_macroblock_encode_p8x8( h, i8+1 );
119 if( i_pixel == PIXEL_8x16 )
120 x264_macroblock_encode_p8x8( h, i8+2 );
122 i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 )
123 + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
124 + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
126 if( h->param.b_cabac )
128 x264_cabac_t cabac_tmp;
130 x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
131 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
135 i_bits = ( x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2 + 128 ) >> 8;
138 return i_ssd + i_bits;
141 int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
145 x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
146 i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
148 if( h->param.b_cabac )
150 x264_cabac_t cabac_tmp;
152 x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
153 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
157 i_bits = ( x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2 + 128 ) >> 8;
160 return i_ssd + i_bits;
163 int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
167 x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
168 i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
170 if( h->param.b_cabac )
172 x264_cabac_t cabac_tmp;
174 x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
175 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
179 i_bits = ( x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2 + 128 ) >> 8;
182 return i_ssd + i_bits;
185 int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
190 x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
191 i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
192 ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
194 h->mb.i_chroma_pred_mode = i_mode;
196 if( h->param.b_cabac )
198 x264_cabac_t cabac_tmp;
200 x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
201 i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
205 i_bits = ( x264_i8x8_chroma_size_cavlc( h ) * i_lambda2 + 128 ) >> 8;
208 return i_ssd + i_bits;
210 /****************************************************************************
211 * Trellis RD quantization
212 ****************************************************************************/
214 #define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
215 #define CABAC_SIZE_BITS 8
216 #define SSD_WEIGHT_BITS 5
217 #define LAMBDA_BITS 4
219 /* precalculate the cost of coding abs_level_m1 */
220 void x264_rdo_init( )
224 for( i_prefix = 0; i_prefix < 15; i_prefix++ )
226 for( i_ctx = 0; i_ctx < 128; i_ctx++ )
232 for( i = 1; i < i_prefix; i++ )
233 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
234 if( i_prefix > 0 && i_prefix < 14 )
235 f8_bits += x264_cabac_size_decision2( &ctx, 0 );
236 f8_bits += 1 << CABAC_SIZE_BITS; //sign
238 cabac_prefix_size[i_prefix][i_ctx] = f8_bits;
239 cabac_prefix_transition[i_prefix][i_ctx] = ctx;
244 // should the intra and inter lambdas be different?
245 // I'm just matching the behaviour of deadzone quant.
246 static const int lambda2_tab[2][52] = {
247 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
248 { 46, 58, 73, 92, 117, 147,
249 185, 233, 294, 370, 466, 587,
250 740, 932, 1174, 1480, 1864, 2349,
251 2959, 3728, 4697, 5918, 7457, 9395,
252 11837, 14914, 18790, 23674, 29828, 37581,
253 47349, 59656, 75163, 94699, 119313, 150326,
254 189399, 238627, 300652, 378798, 477255, 601304,
255 757596, 954511, 1202608, 1515192, 1909022, 2405217,
256 3030384, 3818045, 4810435, 6060769 },
257 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
258 { 27, 34, 43, 54, 68, 86,
259 108, 136, 172, 216, 273, 343,
260 433, 545, 687, 865, 1090, 1374,
261 1731, 2180, 2747, 3461, 4361, 5494,
262 6922, 8721, 10988, 13844, 17442, 21976,
263 27688, 34885, 43953, 55377, 69771, 87906,
264 110755, 139543, 175813, 221511, 279087, 351627,
265 443023, 558174, 703255, 886046, 1116348, 1406511,
266 1772093, 2232697, 2813022, 3544186 }
271 int level_idx; // index into level_tree[]
272 uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
276 // support chroma and i16x16 DC
277 // save cabac state between blocks?
278 // use trellis' RD score instead of x264_mb_decimate_score?
279 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
281 // change weights when using CQMs?
283 // possible optimizations:
284 // make scores fit in 32bit
285 // save quantized coefs during rd, to avoid a duplicate trellis in the final encode
286 // if trellissing all MBRD modes, finish SSD calculation so we can skip all of
287 // the normal dequant/idct/ssd/cabac
289 // the unquant_mf here is not the same as dequant_mf:
290 // in normal operation (dct->quant->dequant->idct) the dct and idct are not
291 // normalized. quant/dequant absorb those scaling factors.
292 // in this function, we just do (quant->unquant) and want the output to be
293 // comparable to the input. so unquant is the direct inverse of quant,
294 // and uses the dct scaling factors, not the idct ones.
296 static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
297 const uint16_t *quant_mf, const int *unquant_mf,
298 const int *coef_weight, const uint8_t *zigzag,
299 int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs )
301 int abs_coefs[64], signs[64];
302 trellis_node_t nodes[2][8];
303 trellis_node_t *nodes_cur = nodes[0];
304 trellis_node_t *nodes_prev = nodes[1];
305 trellis_node_t *bnode;
306 uint8_t cabac_state_sig[64];
307 uint8_t cabac_state_last[64];
308 const int b_interlaced = h->mb.b_interlaced;
309 const int f = 1 << 15; // no deadzone
313 // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
314 // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
315 // but it takes more time to remove dead states than you gain in reduced memory.
319 } level_tree[64*8*2];
320 int i_levels_used = 1;
323 for( i = i_coefs-1; i >= b_ac; i-- )
324 if( (unsigned)(dct[zigzag[i]] * quant_mf[zigzag[i]] + f-1) >= 2*f )
329 memset( dct, 0, i_coefs * sizeof(*dct) );
335 for( ; i >= b_ac; i-- )
337 int coef = dct[zigzag[i]];
338 abs_coefs[i] = abs(coef);
339 signs[i] = coef < 0 ? -1 : 1;
343 for( i = 1; i < 8; i++ )
344 nodes_cur[i].score = TRELLIS_SCORE_MAX;
345 nodes_cur[0].score = 0;
346 nodes_cur[0].level_idx = 0;
347 level_tree[0].abs_level = 0;
348 level_tree[0].next = 0;
350 // coefs are processed in reverse order, because that's how the abs value is coded.
351 // last_coef and significant_coef flags are normally coded in forward order, but
352 // we have to reverse them to match the levels.
353 // in 4x4 blocks, last_coef and significant_coef use a separate context for each
354 // position, so the order doesn't matter, and we don't even have to update their contexts.
355 // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
356 // cabac isn't too sensitive.
360 const uint8_t *ctx_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
361 const uint8_t *ctx_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
362 for( i = 0; i < 63; i++ )
364 cabac_state_sig[i] = ctx_sig[ significant_coeff_flag_offset_8x8[b_interlaced][i] ];
365 cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
370 memcpy( cabac_state_sig, &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
371 memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
373 memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
375 for( i = i_last_nnz; i >= b_ac; i-- )
377 int i_coef = abs_coefs[i];
378 int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> 16;
380 int cost_sig[2], cost_last[2];
383 // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
386 // no need to calculate ssd of 0s: it's the same in all nodes.
387 // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
388 const uint32_t cost_sig0 = x264_cabac_size_decision_noup( &cabac_state_sig[i], 0 )
389 * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
390 for( j = 1; j < 8; j++ )
392 if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
394 #define SET_LEVEL(n,l) \
395 level_tree[i_levels_used].abs_level = l; \
396 level_tree[i_levels_used].next = n.level_idx; \
397 n.level_idx = i_levels_used; \
400 SET_LEVEL( nodes_cur[j], 0 );
401 nodes_cur[j].score += cost_sig0;
407 XCHG( trellis_node_t*, nodes_cur, nodes_prev );
409 for( j = 0; j < 8; j++ )
410 nodes_cur[j].score = TRELLIS_SCORE_MAX;
414 cost_sig[0] = x264_cabac_size_decision_noup( &cabac_state_sig[i], 0 );
415 cost_sig[1] = x264_cabac_size_decision_noup( &cabac_state_sig[i], 1 );
416 cost_last[0] = x264_cabac_size_decision_noup( &cabac_state_last[i], 0 );
417 cost_last[1] = x264_cabac_size_decision_noup( &cabac_state_last[i], 1 );
421 cost_sig[0] = cost_sig[1] = 0;
422 cost_last[0] = cost_last[1] = 0;
425 // there are a few cases where increasing the coeff magnitude helps,
426 // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
427 // could also try q-2: that sometimes helps, but also sometimes decimates blocks
428 // that are better left coded, especially at QP > 40.
429 for( abs_level = q; abs_level >= q-1; abs_level-- )
431 int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
432 uint64_t ssd = (int64_t)d*d * coef_weight[i];
434 for( j = 0; j < 8; j++ )
437 if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
441 /* code the proposed level, and count how much entropy it would take */
442 if( abs_level || node_ctx )
444 unsigned f8_bits = cost_sig[ abs_level != 0 ];
447 const int i_prefix = X264_MIN( abs_level - 1, 14 );
448 f8_bits += cost_last[ node_ctx == 0 ];
449 f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
452 uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
453 f8_bits += cabac_prefix_size[i_prefix][*ctx];
454 *ctx = cabac_prefix_transition[i_prefix][*ctx];
455 if( abs_level >= 15 )
456 f8_bits += bs_size_ue( abs_level - 15 ) << CABAC_SIZE_BITS;
457 node_ctx = coeff_abs_level_transition[1][node_ctx];
461 f8_bits += 1 << CABAC_SIZE_BITS;
462 node_ctx = coeff_abs_level_transition[0][node_ctx];
465 n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
470 /* save the node if it's better than any existing node with the same cabac ctx */
471 if( n.score < nodes_cur[node_ctx].score )
473 SET_LEVEL( n, abs_level );
474 nodes_cur[node_ctx] = n;
480 /* output levels from the best path through the trellis */
481 bnode = &nodes_cur[0];
482 for( j = 1; j < 8; j++ )
483 if( nodes_cur[j].score < bnode->score )
484 bnode = &nodes_cur[j];
486 j = bnode->level_idx;
487 for( i = b_ac; i < i_coefs; i++ )
489 dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
490 j = level_tree[j].next;
495 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
496 int i_qp, int i_ctxBlockCat, int b_intra )
498 int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);
499 quant_trellis_cabac( h, (int16_t*)dct,
500 h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
501 x264_dct4_weight2_zigzag[h->mb.b_interlaced],
502 x264_zigzag_scan4[h->mb.b_interlaced],
503 i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 );
507 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
508 int i_qp, int b_intra )
510 quant_trellis_cabac( h, (int16_t*)dct,
511 h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
512 x264_dct8_weight2_zigzag[h->mb.b_interlaced],
513 x264_zigzag_scan8[h->mb.b_interlaced],
514 DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 );