git.sesse.net Git - x264/blob - encoder/rdo.c

   1 /*****************************************************************************
   2  * rdo.c: h264 encoder library (rate-distortion optimization)
   3  *****************************************************************************
   4  * Copyright (C) 2005-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Fiona Glaser <fiona@x264.com>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 /* duplicate all the writer functions, just calculating bit cost
  25  * instead of writing the bitstream.
  26  * TODO: use these for fast 1st pass too. */
  27
  28 #define RDO_SKIP_BS 1
  29
  30 /* Transition and size tables for abs<9 MVD and residual coding */
  31 /* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
  32 static uint8_t cabac_transition_unary[15][128];
  33 static uint16_t cabac_size_unary[15][128];
  34 /* Transition and size tables for abs>9 MVD */
  35 /* Consist of 5 1s and a bypass sign bit */
  36 static uint8_t cabac_transition_5ones[128];
  37 static uint16_t cabac_size_5ones[128];
  38
  39 /* CAVLC: produces exactly the same bit count as a normal encode */
  40 /* this probably still leaves some unnecessary computations */
  41 #define bs_write1(s,v)     ((s)->i_bits_encoded += 1)
  42 #define bs_write(s,n,v)    ((s)->i_bits_encoded += (n))
  43 #define bs_write_ue(s,v)   ((s)->i_bits_encoded += bs_size_ue(v))
  44 #define bs_write_se(s,v)   ((s)->i_bits_encoded += bs_size_se(v))
  45 #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
  46 #define x264_macroblock_write_cavlc  static x264_macroblock_size_cavlc
  47 #include "cavlc.c"
  48
  49 /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
  50  * fractional bits, but only finite precision. */
  51 #undef  x264_cabac_encode_decision
  52 #undef  x264_cabac_encode_decision_noup
  53 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
  54 #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
  55 #define x264_cabac_encode_terminal(c)     x264_cabac_size_decision_noup(c,276,0)
  56 #define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
  57 #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
  58 #define x264_cabac_encode_flush(h,c)
  59 #define x264_macroblock_write_cabac  static x264_macroblock_size_cabac
  60 #include "cabac.c"
  61
  62 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
  63         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
  64
  65
  66 /* Sum the cached SATDs to avoid repeating them. */
  67 static inline int sum_satd( x264_t *h, int pixel, int x, int y )
  68 {
  69     int satd = 0;
  70     int min_x = x>>2;
  71     int min_y = y>>2;
  72     int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
  73     int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
  74     if( pixel == PIXEL_16x16 )
  75         return h->mb.pic.fenc_satd_sum;
  76     for( y = min_y; y < max_y; y++ )
  77         for( x = min_x; x < max_x; x++ )
  78             satd += h->mb.pic.fenc_satd[y][x];
  79     return satd;
  80 }
  81
  82 static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
  83 {
  84     int sa8d = 0;
  85     int min_x = x>>3;
  86     int min_y = y>>3;
  87     int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
  88     int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
  89     if( pixel == PIXEL_16x16 )
  90         return h->mb.pic.fenc_sa8d_sum;
  91     for( y = min_y; y < max_y; y++ )
  92         for( x = min_x; x < max_x; x++ )
  93             sa8d += h->mb.pic.fenc_sa8d[y][x];
  94     return sa8d;
  95 }
  96
  97 /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
  98 /* SATD and SA8D are used to measure block complexity. */
/* Both the SATD difference and the SA8D difference are used (averaged) to avoid bias from the DCT size. */
/* Using SATD only, for example, results in overuse of 8x8dct, while the opposite occurs when using SA8D only. */
 101
 102 /* FIXME:  Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
 103 /* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
 104 /* This optimization can also be used in non-RD transform decision. */
 105
 106 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
 107 {
 108     DECLARE_ALIGNED_16(static uint8_t zero[16]);
 109     int satd = 0;
 110     uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
 111     uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
 112     if( p == 0 && h->mb.i_psy_rd )
 113     {
 114         /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
 115         if( size <= PIXEL_8x8 )
 116         {
 117             uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
 118             satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
 119                  + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
 120             satd >>= 1;
 121         }
 122         else
 123         {
 124             int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
 125             satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
 126         }
 127         satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
 128     }
 129     return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
 130 }
 131
 132 static inline int ssd_mb( x264_t *h )
 133 {
 134     return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
 135          + ssd_plane(h, PIXEL_8x8,   1, 0, 0)
 136          + ssd_plane(h, PIXEL_8x8,   2, 0, 0);
 137 }
 138
 139 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
 140 {
 141     int b_transform_bak = h->mb.b_transform_8x8;
 142     int i_ssd;
 143     int i_bits;
 144
 145     x264_macroblock_encode( h );
 146
 147     i_ssd = ssd_mb( h );
 148
 149     if( IS_SKIP( h->mb.i_type ) )
 150     {
 151         i_bits = (1 * i_lambda2 + 128) >> 8;
 152     }
 153     else if( h->param.b_cabac )
 154     {
 155         x264_cabac_t cabac_tmp;
 156         COPY_CABAC;
 157         x264_macroblock_size_cabac( h, &cabac_tmp );
 158         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
 159     }
 160     else
 161     {
 162         bs_t bs_tmp = h->out.bs;
 163         bs_tmp.i_bits_encoded = 0;
 164         x264_macroblock_size_cavlc( h, &bs_tmp );
 165         i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
 166     }
 167
 168     h->mb.b_transform_8x8 = b_transform_bak;
 169
 170     return i_ssd + i_bits;
 171 }
 172
 173 /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
 174
 175 static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
 176 {
 177     uint64_t i_ssd, i_bits;
 178
 179     x264_macroblock_encode_p4x4( h, i4 );
 180     if( i_pixel == PIXEL_8x4 )
 181         x264_macroblock_encode_p4x4( h, i4+1 );
 182     if( i_pixel == PIXEL_4x8 )
 183         x264_macroblock_encode_p4x4( h, i4+2 );
 184
 185     i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
 186
 187     if( h->param.b_cabac )
 188     {
 189         x264_cabac_t cabac_tmp;
 190         COPY_CABAC;
 191         x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
 192         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 193     }
 194     else
 195     {
 196         i_bits = x264_subpartition_size_cavlc( h, i4, i_pixel );
 197     }
 198
 199     return (i_ssd<<8) + i_bits;
 200 }
 201
 202 uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
 203 {
 204     uint64_t i_ssd, i_bits;
 205     int i8 = i4 >> 2;
 206
 207     if( i_pixel == PIXEL_16x16 )
 208     {
 209         int type_bak = h->mb.i_type;
 210         int i_cost = x264_rd_cost_mb( h, i_lambda2 );
 211         h->mb.i_type = type_bak;
 212         return i_cost;
 213     }
 214
 215     if( i_pixel > PIXEL_8x8 )
 216         return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );
 217
 218     x264_macroblock_encode_p8x8( h, i8 );
 219     if( i_pixel == PIXEL_16x8 )
 220         x264_macroblock_encode_p8x8( h, i8+1 );
 221     if( i_pixel == PIXEL_8x16 )
 222         x264_macroblock_encode_p8x8( h, i8+2 );
 223
 224     i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 )
 225           + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
 226           + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
 227
 228     if( h->param.b_cabac )
 229     {
 230         x264_cabac_t cabac_tmp;
 231         COPY_CABAC;
 232         x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
 233         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 234     }
 235     else
 236     {
 237         i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
 238     }
 239
 240     return (i_ssd<<8) + i_bits;
 241 }
 242
 243 static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
 244 {
 245     uint64_t i_ssd, i_bits;
 246
 247     x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
 248     i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
 249
 250     if( h->param.b_cabac )
 251     {
 252         x264_cabac_t cabac_tmp;
 253         COPY_CABAC;
 254         x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
 255         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 256     }
 257     else
 258     {
 259         i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
 260     }
 261
 262     return (i_ssd<<8) + i_bits;
 263 }
 264
 265 static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
 266 {
 267     uint64_t i_ssd, i_bits;
 268
 269     x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
 270     i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
 271
 272     if( h->param.b_cabac )
 273     {
 274         x264_cabac_t cabac_tmp;
 275         COPY_CABAC;
 276         x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
 277         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 278     }
 279     else
 280     {
 281         i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
 282     }
 283
 284     return (i_ssd<<8) + i_bits;
 285 }
 286
 287 static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
 288 {
 289     uint64_t i_ssd, i_bits;
 290
 291     if( b_dct )
 292         x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
 293     i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
 294             ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
 295
 296     h->mb.i_chroma_pred_mode = i_mode;
 297
 298     if( h->param.b_cabac )
 299     {
 300         x264_cabac_t cabac_tmp;
 301         COPY_CABAC;
 302         x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
 303         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 304     }
 305     else
 306     {
 307         i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
 308     }
 309
 310     return (i_ssd<<8) + i_bits;
 311 }
 312 /****************************************************************************
 313  * Trellis RD quantization
 314  ****************************************************************************/
 315
 316 #define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
 317 #define CABAC_SIZE_BITS 8
 318 #define SSD_WEIGHT_BITS 5
 319 #define LAMBDA_BITS 4
 320
 321 /* precalculate the cost of coding various combinations of bits in a single context */
 322 void x264_rdo_init( void )
 323 {
 324     int i_prefix, i_ctx, i;
 325     for( i_prefix = 0; i_prefix < 15; i_prefix++ )
 326     {
 327         for( i_ctx = 0; i_ctx < 128; i_ctx++ )
 328         {
 329             int f8_bits = 0;
 330             uint8_t ctx = i_ctx;
 331
 332             for( i = 1; i < i_prefix; i++ )
 333                 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
 334             if( i_prefix > 0 && i_prefix < 14 )
 335                 f8_bits += x264_cabac_size_decision2( &ctx, 0 );
 336             f8_bits += 1 << CABAC_SIZE_BITS; //sign
 337
 338             cabac_size_unary[i_prefix][i_ctx] = f8_bits;
 339             cabac_transition_unary[i_prefix][i_ctx] = ctx;
 340         }
 341     }
 342     for( i_ctx = 0; i_ctx < 128; i_ctx++ )
 343     {
 344         int f8_bits = 0;
 345         uint8_t ctx = i_ctx;
 346
 347         for( i = 0; i < 5; i++ )
 348             f8_bits += x264_cabac_size_decision2( &ctx, 1 );
 349         f8_bits += 1 << CABAC_SIZE_BITS; //sign
 350
 351         cabac_size_5ones[i_ctx] = f8_bits;
 352         cabac_transition_5ones[i_ctx] = ctx;
 353     }
 354 }
 355
 356 // should the intra and inter lambdas be different?
 357 // I'm just matching the behaviour of deadzone quant.
 358 static const int lambda2_tab[2][52] = {
 359     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 360     {    46,      58,      73,      92,     117,     147,
 361         185,     233,     294,     370,     466,     587,
 362         740,     932,    1174,    1480,    1864,    2349,
 363        2959,    3728,    4697,    5918,    7457,    9395,
 364       11837,   14914,   18790,   23674,   29828,   37581,
 365       47349,   59656,   75163,   94699,  119313,  150326,
 366      189399,  238627,  300652,  378798,  477255,  601304,
 367      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 368     3030384, 3818045, 4810435, 6060769 },
 369     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 370     {    27,      34,      43,      54,      68,      86,
 371         108,     136,     172,     216,     273,     343,
 372         433,     545,     687,     865,    1090,    1374,
 373        1731,    2180,    2747,    3461,    4361,    5494,
 374        6922,    8721,   10988,   13844,   17442,   21976,
 375       27688,   34885,   43953,   55377,   69771,   87906,
 376      110755,  139543,  175813,  221511,  279087,  351627,
 377      443023,  558174,  703255,  886046, 1116348, 1406511,
 378     1772093, 2232697, 2813022, 3544186 }
 379 };
 380
 381 typedef struct {
 382     int64_t score;
 383     int level_idx; // index into level_tree[]
 384     uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
 385 } trellis_node_t;
 386
 387 // TODO:
 388 // save cabac state between blocks?
 389 // use trellis' RD score instead of x264_mb_decimate_score?
 390 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
 391 //   each position?
 392 // change weights when using CQMs?
 393
 394 // possible optimizations:
 395 // make scores fit in 32bit
 396 // save quantized coefs during rd, to avoid a duplicate trellis in the final encode
 397 // if trellissing all MBRD modes, finish SSD calculation so we can skip all of
 398 //   the normal dequant/idct/ssd/cabac
 399
 400 // the unquant_mf here is not the same as dequant_mf:
 401 // in normal operation (dct->quant->dequant->idct) the dct and idct are not
 402 // normalized. quant/dequant absorb those scaling factors.
 403 // in this function, we just do (quant->unquant) and want the output to be
 404 // comparable to the input. so unquant is the direct inverse of quant,
 405 // and uses the dct scaling factors, not the idct ones.
 406
 407 static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
 408                                  const uint16_t *quant_mf, const int *unquant_mf,
 409                                  const int *coef_weight, const uint8_t *zigzag,
 410                                  int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
 411 {
 412     int abs_coefs[64], signs[64];
 413     trellis_node_t nodes[2][8];
 414     trellis_node_t *nodes_cur = nodes[0];
 415     trellis_node_t *nodes_prev = nodes[1];
 416     trellis_node_t *bnode;
 417     uint8_t cabac_state_sig[64];
 418     uint8_t cabac_state_last[64];
 419     const int b_interlaced = h->mb.b_interlaced;
 420     const int f = 1 << 15; // no deadzone
 421     int i_last_nnz;
 422     int i, j;
 423
 424     // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
 425     // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
 426     // but it takes more time to remove dead states than you gain in reduced memory.
 427     struct {
 428         uint16_t abs_level;
 429         uint16_t next;
 430     } level_tree[64*8*2];
 431     int i_levels_used = 1;
 432
 433     /* init coefs */
 434     for( i = i_coefs-1; i >= b_ac; i-- )
 435         if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
 436             break;
 437
 438     if( i < b_ac )
 439     {
 440         memset( dct, 0, i_coefs * sizeof(*dct) );
 441         return;
 442     }
 443
 444     i_last_nnz = i;
 445
 446     for( ; i >= b_ac; i-- )
 447     {
 448         int coef = dct[zigzag[i]];
 449         abs_coefs[i] = abs(coef);
 450         signs[i] = coef < 0 ? -1 : 1;
 451     }
 452
 453     /* init trellis */
 454     for( i = 1; i < 8; i++ )
 455         nodes_cur[i].score = TRELLIS_SCORE_MAX;
 456     nodes_cur[0].score = 0;
 457     nodes_cur[0].level_idx = 0;
 458     level_tree[0].abs_level = 0;
 459     level_tree[0].next = 0;
 460
 461     // coefs are processed in reverse order, because that's how the abs value is coded.
 462     // last_coef and significant_coef flags are normally coded in forward order, but
 463     // we have to reverse them to match the levels.
 464     // in 4x4 blocks, last_coef and significant_coef use a separate context for each
 465     // position, so the order doesn't matter, and we don't even have to update their contexts.
 466     // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
 467     // cabac isn't too sensitive.
 468
 469     if( i_coefs == 64 )
 470     {
 471         const uint8_t *ctx_sig  = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
 472         const uint8_t *ctx_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
 473         for( i = 0; i < 63; i++ )
 474         {
 475             cabac_state_sig[i]  = ctx_sig[ significant_coeff_flag_offset_8x8[b_interlaced][i] ];
 476             cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
 477         }
 478     }
 479     else if( !dc || i_ctxBlockCat != DCT_CHROMA_DC )
 480     {
 481         memcpy( cabac_state_sig,  &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
 482         memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
 483     }
 484     else
 485     {
 486         memcpy( cabac_state_sig,  &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
 487         memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
 488     }
 489     memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
 490
 491     for( i = i_last_nnz; i >= b_ac; i-- )
 492     {
 493         int i_coef = abs_coefs[i];
 494         int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
 495         int abs_level;
 496         int cost_sig[2], cost_last[2];
 497         trellis_node_t n;
 498
 499         // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
 500         if( q == 0 )
 501         {
 502             // no need to calculate ssd of 0s: it's the same in all nodes.
 503             // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
 504             const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 )
 505                                      * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 506             for( j = 1; j < 8; j++ )
 507             {
 508                 if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
 509                 {
 510 #define SET_LEVEL(n,l) \
 511                     level_tree[i_levels_used].abs_level = l; \
 512                     level_tree[i_levels_used].next = n.level_idx; \
 513                     n.level_idx = i_levels_used; \
 514                     i_levels_used++;
 515
 516                     SET_LEVEL( nodes_cur[j], 0 );
 517                     nodes_cur[j].score += cost_sig0;
 518                 }
 519             }
 520             continue;
 521         }
 522
 523         XCHG( trellis_node_t*, nodes_cur, nodes_prev );
 524
 525         for( j = 0; j < 8; j++ )
 526             nodes_cur[j].score = TRELLIS_SCORE_MAX;
 527
 528         if( i < i_coefs-1 )
 529         {
 530             cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 );
 531             cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 1 );
 532             cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 0 );
 533             cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 1 );
 534         }
 535         else
 536         {
 537             cost_sig[0] = cost_sig[1] = 0;
 538             cost_last[0] = cost_last[1] = 0;
 539         }
 540
 541         // there are a few cases where increasing the coeff magnitude helps,
 542         // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
 543         // could also try q-2: that sometimes helps, but also sometimes decimates blocks
 544         // that are better left coded, especially at QP > 40.
 545         for( abs_level = q; abs_level >= q-1; abs_level-- )
 546         {
 547             int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
 548             int d = i_coef - unquant_abs_level;
 549             int64_t ssd;
 550             /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
 551             if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
 552             {
 553                 int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
 554                 int predicted_coef = orig_coef - i_coef * signs[i];
 555                 int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
 556                 int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
 557                 ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
 558             }
 559             else
 560             /* FIXME: for i16x16 dc is this weight optimal? */
 561                 ssd = (int64_t)d*d * (dc?256:coef_weight[i]);
 562
 563             for( j = 0; j < 8; j++ )
 564             {
 565                 int node_ctx = j;
 566                 if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
 567                     continue;
 568                 n = nodes_prev[j];
 569
 570                 /* code the proposed level, and count how much entropy it would take */
 571                 if( abs_level || node_ctx )
 572                 {
 573                     unsigned f8_bits = cost_sig[ abs_level != 0 ];
 574                     if( abs_level )
 575                     {
 576                         const int i_prefix = X264_MIN( abs_level - 1, 14 );
 577                         f8_bits += cost_last[ node_ctx == 0 ];
 578                         f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
 579                         if( i_prefix > 0 )
 580                         {
 581                             uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
 582                             f8_bits += cabac_size_unary[i_prefix][*ctx];
 583                             *ctx = cabac_transition_unary[i_prefix][*ctx];
 584                             if( abs_level >= 15 )
 585                                 f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
 586                             node_ctx = coeff_abs_level_transition[1][node_ctx];
 587                         }
 588                         else
 589                         {
 590                             f8_bits += 1 << CABAC_SIZE_BITS;
 591                             node_ctx = coeff_abs_level_transition[0][node_ctx];
 592                         }
 593                     }
 594                     n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 595                 }
 596
 597                 n.score += ssd;
 598
 599                 /* save the node if it's better than any existing node with the same cabac ctx */
 600                 if( n.score < nodes_cur[node_ctx].score )
 601                 {
 602                     SET_LEVEL( n, abs_level );
 603                     nodes_cur[node_ctx] = n;
 604                 }
 605             }
 606         }
 607     }
 608
 609     /* output levels from the best path through the trellis */
 610     bnode = &nodes_cur[0];
 611     for( j = 1; j < 8; j++ )
 612         if( nodes_cur[j].score < bnode->score )
 613             bnode = &nodes_cur[j];
 614
 615     j = bnode->level_idx;
 616     for( i = b_ac; i < i_coefs; i++ )
 617     {
 618         dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
 619         j = level_tree[j].next;
 620     }
 621 }
 622
 623 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
 624
 625 void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
 626                             int i_qp, int i_ctxBlockCat, int b_intra )
 627 {
 628     quant_trellis_cabac( h, (int16_t*)dct,
 629         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
 630         NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
 631         i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
 632 }
 633
 634 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
 635                              int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 636 {
 637     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
 638     quant_trellis_cabac( h, (int16_t*)dct,
 639         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
 640         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
 641         x264_zigzag_scan4[h->mb.b_interlaced],
 642         i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
 643 }
 644
 645 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
 646                              int i_qp, int b_intra, int idx )
 647 {
 648     quant_trellis_cabac( h, (int16_t*)dct,
 649         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
 650         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
 651         x264_zigzag_scan8[h->mb.b_interlaced],
 652         DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 0, 64, idx );
 653 }
 654