git.sesse.net Git - x264/blob - encoder/rdo.c

   1 /*****************************************************************************
   2  * rdo.c: h264 encoder library (rate-distortion optimization)
   3  *****************************************************************************
   4  * Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  19  *****************************************************************************/
  20
  21 /* duplicate all the writer functions, just calculating bit cost
  22  * instead of writing the bitstream.
  23  * TODO: use these for fast 1st pass too. */
  24
  25 #define RDO_SKIP_BS 1
  26
  27 /* Transition and size tables for abs<9 MVD and residual coding */
  28 /* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */
  29 static uint8_t cabac_transition_unary[15][128];
  30 static uint16_t cabac_size_unary[15][128];
  31 /* Transition and size tables for abs>9 MVD */
  32 /* Consist of 5 1s and a bypass sign bit */
  33 static uint8_t cabac_transition_5ones[128];
  34 static uint16_t cabac_size_5ones[128];
  35
  36 /* CAVLC: produces exactly the same bit count as a normal encode */
  37 /* this probably still leaves some unnecessary computations */
  38 #define bs_write1(s,v)     ((s)->i_bits_encoded += 1)
  39 #define bs_write(s,n,v)    ((s)->i_bits_encoded += (n))
  40 #define bs_write_ue(s,v)   ((s)->i_bits_encoded += bs_size_ue(v))
  41 #define bs_write_se(s,v)   ((s)->i_bits_encoded += bs_size_se(v))
  42 #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
  43 #define x264_macroblock_write_cavlc  static x264_macroblock_size_cavlc
  44 #include "cavlc.c"
  45
  46 /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
  47  * fractional bits, but only finite precision. */
  48 #undef  x264_cabac_encode_decision
  49 #undef  x264_cabac_encode_decision_noup
  50 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
  51 #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v)
  52 #define x264_cabac_encode_terminal(c)     x264_cabac_size_decision_noup(c,276,0)
  53 #define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
  54 #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
  55 #define x264_cabac_encode_flush(h,c)
  56 #define x264_macroblock_write_cabac  static x264_macroblock_size_cabac
  57 #include "cabac.c"
  58
  59 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
  60         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
  61
  62
  63 /* Sum the cached SATDs to avoid repeating them. */
  64 static inline int sum_satd( x264_t *h, int pixel, int x, int y )
  65 {
  66     int satd = 0;
  67     int min_x = x>>2;
  68     int min_y = y>>2;
  69     int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
  70     int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
  71     if( pixel == PIXEL_16x16 )
  72         return h->mb.pic.fenc_satd_sum;
  73     for( y = min_y; y < max_y; y++ )
  74         for( x = min_x; x < max_x; x++ )
  75             satd += h->mb.pic.fenc_satd[y][x];
  76     return satd;
  77 }
  78
  79 static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
  80 {
  81     int sa8d = 0;
  82     int min_x = x>>3;
  83     int min_y = y>>3;
  84     int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
  85     int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
  86     if( pixel == PIXEL_16x16 )
  87         return h->mb.pic.fenc_sa8d_sum;
  88     for( y = min_y; y < max_y; y++ )
  89         for( x = min_x; x < max_x; x++ )
  90             sa8d += h->mb.pic.fenc_sa8d[y][x];
  91     return sa8d;
  92 }
  93
  94 /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
  95 /* SATD and SA8D are used to measure block complexity. */
  96 /* The difference between SATD and SA8D scores are both used to avoid bias from the DCT size.  Using SATD */
  97 /* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */
  98
  99 /* FIXME:  Is there a better metric than averaged SATD/SA8D difference for complexity difference? */
 100 /* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */
 101 /* This optimization can also be used in non-RD transform decision. */
 102
 103 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
 104 {
 105     DECLARE_ALIGNED_16(static uint8_t zero[16]);
 106     int satd = 0;
 107     uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
 108     uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
 109     if( p == 0 && h->mb.i_psy_rd )
 110     {
 111         /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
 112         if( size <= PIXEL_8x8 )
 113         {
 114             uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
 115             satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
 116                  + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
 117             satd >>= 1;
 118         }
 119         else
 120         {
 121             int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
 122             satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
 123         }
 124         satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8;
 125     }
 126     return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd;
 127 }
 128
 129 static inline int ssd_mb( x264_t *h )
 130 {
 131     return ssd_plane(h, PIXEL_16x16, 0, 0, 0)
 132          + ssd_plane(h, PIXEL_8x8,   1, 0, 0)
 133          + ssd_plane(h, PIXEL_8x8,   2, 0, 0);
 134 }
 135
 136 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
 137 {
 138     int b_transform_bak = h->mb.b_transform_8x8;
 139     int i_ssd;
 140     int i_bits;
 141
 142     x264_macroblock_encode( h );
 143
 144     i_ssd = ssd_mb( h );
 145
 146     if( IS_SKIP( h->mb.i_type ) )
 147     {
 148         i_bits = (1 * i_lambda2 + 128) >> 8;
 149     }
 150     else if( h->param.b_cabac )
 151     {
 152         x264_cabac_t cabac_tmp;
 153         COPY_CABAC;
 154         x264_macroblock_size_cabac( h, &cabac_tmp );
 155         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
 156     }
 157     else
 158     {
 159         bs_t bs_tmp = h->out.bs;
 160         bs_tmp.i_bits_encoded = 0;
 161         x264_macroblock_size_cavlc( h, &bs_tmp );
 162         i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
 163     }
 164
 165     h->mb.b_transform_8x8 = b_transform_bak;
 166
 167     return i_ssd + i_bits;
 168 }
 169
 170 /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */
 171
 172 static uint64_t x264_rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel )
 173 {
 174     uint64_t i_ssd, i_bits;
 175
 176     x264_macroblock_encode_p4x4( h, i4 );
 177     if( i_pixel == PIXEL_8x4 )
 178         x264_macroblock_encode_p4x4( h, i4+1 );
 179     if( i_pixel == PIXEL_4x8 )
 180         x264_macroblock_encode_p4x4( h, i4+2 );
 181
 182     i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
 183
 184     if( h->param.b_cabac )
 185     {
 186         x264_cabac_t cabac_tmp;
 187         COPY_CABAC;
 188         x264_subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel );
 189         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 190     }
 191     else
 192     {
 193         i_bits = x264_subpartition_size_cavlc( h, i4, i_pixel );
 194     }
 195
 196     return (i_ssd<<8) + i_bits;
 197 }
 198
 199 uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
 200 {
 201     uint64_t i_ssd, i_bits;
 202     int i8 = i4 >> 2;
 203
 204     if( i_pixel == PIXEL_16x16 )
 205     {
 206         int type_bak = h->mb.i_type;
 207         int i_cost = x264_rd_cost_mb( h, i_lambda2 );
 208         h->mb.i_type = type_bak;
 209         return i_cost;
 210     }
 211
 212     if( i_pixel > PIXEL_8x8 )
 213         return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel );
 214
 215     x264_macroblock_encode_p8x8( h, i8 );
 216     if( i_pixel == PIXEL_16x8 )
 217         x264_macroblock_encode_p8x8( h, i8+1 );
 218     if( i_pixel == PIXEL_8x16 )
 219         x264_macroblock_encode_p8x8( h, i8+2 );
 220
 221     i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 )
 222           + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
 223           + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
 224
 225     if( h->param.b_cabac )
 226     {
 227         x264_cabac_t cabac_tmp;
 228         COPY_CABAC;
 229         x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
 230         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 231     }
 232     else
 233     {
 234         i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
 235     }
 236
 237     return (i_ssd<<8) + i_bits;
 238 }
 239
 240 static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
 241 {
 242     uint64_t i_ssd, i_bits;
 243
 244     x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
 245     i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
 246
 247     if( h->param.b_cabac )
 248     {
 249         x264_cabac_t cabac_tmp;
 250         COPY_CABAC;
 251         x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
 252         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 253     }
 254     else
 255     {
 256         i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
 257     }
 258
 259     return (i_ssd<<8) + i_bits;
 260 }
 261
 262 static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
 263 {
 264     uint64_t i_ssd, i_bits;
 265
 266     x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
 267     i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
 268
 269     if( h->param.b_cabac )
 270     {
 271         x264_cabac_t cabac_tmp;
 272         COPY_CABAC;
 273         x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
 274         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 275     }
 276     else
 277     {
 278         i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
 279     }
 280
 281     return (i_ssd<<8) + i_bits;
 282 }
 283
 284 static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
 285 {
 286     uint64_t i_ssd, i_bits;
 287
 288     if( b_dct )
 289         x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
 290     i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
 291             ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
 292
 293     h->mb.i_chroma_pred_mode = i_mode;
 294
 295     if( h->param.b_cabac )
 296     {
 297         x264_cabac_t cabac_tmp;
 298         COPY_CABAC;
 299         x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
 300         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
 301     }
 302     else
 303     {
 304         i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
 305     }
 306
 307     return (i_ssd<<8) + i_bits;
 308 }
 309 /****************************************************************************
 310  * Trellis RD quantization
 311  ****************************************************************************/
 312
 313 #define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
 314 #define CABAC_SIZE_BITS 8
 315 #define SSD_WEIGHT_BITS 5
 316 #define LAMBDA_BITS 4
 317
 318 /* precalculate the cost of coding various combinations of bits in a single context */
 319 void x264_rdo_init( void )
 320 {
 321     int i_prefix, i_ctx, i;
 322     for( i_prefix = 0; i_prefix < 15; i_prefix++ )
 323     {
 324         for( i_ctx = 0; i_ctx < 128; i_ctx++ )
 325         {
 326             int f8_bits = 0;
 327             uint8_t ctx = i_ctx;
 328
 329             for( i = 1; i < i_prefix; i++ )
 330                 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
 331             if( i_prefix > 0 && i_prefix < 14 )
 332                 f8_bits += x264_cabac_size_decision2( &ctx, 0 );
 333             f8_bits += 1 << CABAC_SIZE_BITS; //sign
 334
 335             cabac_size_unary[i_prefix][i_ctx] = f8_bits;
 336             cabac_transition_unary[i_prefix][i_ctx] = ctx;
 337         }
 338     }
 339     for( i_ctx = 0; i_ctx < 128; i_ctx++ )
 340     {
 341         int f8_bits = 0;
 342         uint8_t ctx = i_ctx;
 343
 344         for( i = 0; i < 5; i++ )
 345             f8_bits += x264_cabac_size_decision2( &ctx, 1 );
 346         f8_bits += 1 << CABAC_SIZE_BITS; //sign
 347
 348         cabac_size_5ones[i_ctx] = f8_bits;
 349         cabac_transition_5ones[i_ctx] = ctx;
 350     }
 351 }
 352
 353 // should the intra and inter lambdas be different?
 354 // I'm just matching the behaviour of deadzone quant.
 355 static const int lambda2_tab[2][52] = {
 356     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 357     {    46,      58,      73,      92,     117,     147,
 358         185,     233,     294,     370,     466,     587,
 359         740,     932,    1174,    1480,    1864,    2349,
 360        2959,    3728,    4697,    5918,    7457,    9395,
 361       11837,   14914,   18790,   23674,   29828,   37581,
 362       47349,   59656,   75163,   94699,  119313,  150326,
 363      189399,  238627,  300652,  378798,  477255,  601304,
 364      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 365     3030384, 3818045, 4810435, 6060769 },
 366     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 367     {    27,      34,      43,      54,      68,      86,
 368         108,     136,     172,     216,     273,     343,
 369         433,     545,     687,     865,    1090,    1374,
 370        1731,    2180,    2747,    3461,    4361,    5494,
 371        6922,    8721,   10988,   13844,   17442,   21976,
 372       27688,   34885,   43953,   55377,   69771,   87906,
 373      110755,  139543,  175813,  221511,  279087,  351627,
 374      443023,  558174,  703255,  886046, 1116348, 1406511,
 375     1772093, 2232697, 2813022, 3544186 }
 376 };
 377
 378 typedef struct {
 379     int64_t score;
 380     int level_idx; // index into level_tree[]
 381     uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
 382 } trellis_node_t;
 383
 384 // TODO:
 385 // save cabac state between blocks?
 386 // use trellis' RD score instead of x264_mb_decimate_score?
 387 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
 388 //   each position?
 389 // change weights when using CQMs?
 390
 391 // possible optimizations:
 392 // make scores fit in 32bit
 393 // save quantized coefs during rd, to avoid a duplicate trellis in the final encode
 394 // if trellissing all MBRD modes, finish SSD calculation so we can skip all of
 395 //   the normal dequant/idct/ssd/cabac
 396
 397 // the unquant_mf here is not the same as dequant_mf:
 398 // in normal operation (dct->quant->dequant->idct) the dct and idct are not
 399 // normalized. quant/dequant absorb those scaling factors.
 400 // in this function, we just do (quant->unquant) and want the output to be
 401 // comparable to the input. so unquant is the direct inverse of quant,
 402 // and uses the dct scaling factors, not the idct ones.
 403
 404 static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct,
 405                                  const uint16_t *quant_mf, const int *unquant_mf,
 406                                  const int *coef_weight, const uint8_t *zigzag,
 407                                  int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx )
 408 {
 409     int abs_coefs[64], signs[64];
 410     trellis_node_t nodes[2][8];
 411     trellis_node_t *nodes_cur = nodes[0];
 412     trellis_node_t *nodes_prev = nodes[1];
 413     trellis_node_t *bnode;
 414     uint8_t cabac_state_sig[64];
 415     uint8_t cabac_state_last[64];
 416     const int b_interlaced = h->mb.b_interlaced;
 417     const int f = 1 << 15; // no deadzone
 418     int i_last_nnz;
 419     int i, j;
 420
 421     // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
 422     // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
 423     // but it takes more time to remove dead states than you gain in reduced memory.
 424     struct {
 425         uint16_t abs_level;
 426         uint16_t next;
 427     } level_tree[64*8*2];
 428     int i_levels_used = 1;
 429
 430     /* init coefs */
 431     for( i = i_coefs-1; i >= b_ac; i-- )
 432         if( (unsigned)(dct[zigzag[i]] * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) + f-1) >= 2*f )
 433             break;
 434
 435     if( i < b_ac )
 436     {
 437         memset( dct, 0, i_coefs * sizeof(*dct) );
 438         return;
 439     }
 440
 441     i_last_nnz = i;
 442
 443     for( ; i >= b_ac; i-- )
 444     {
 445         int coef = dct[zigzag[i]];
 446         abs_coefs[i] = abs(coef);
 447         signs[i] = coef < 0 ? -1 : 1;
 448     }
 449
 450     /* init trellis */
 451     for( i = 1; i < 8; i++ )
 452         nodes_cur[i].score = TRELLIS_SCORE_MAX;
 453     nodes_cur[0].score = 0;
 454     nodes_cur[0].level_idx = 0;
 455     level_tree[0].abs_level = 0;
 456     level_tree[0].next = 0;
 457
 458     // coefs are processed in reverse order, because that's how the abs value is coded.
 459     // last_coef and significant_coef flags are normally coded in forward order, but
 460     // we have to reverse them to match the levels.
 461     // in 4x4 blocks, last_coef and significant_coef use a separate context for each
 462     // position, so the order doesn't matter, and we don't even have to update their contexts.
 463     // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
 464     // cabac isn't too sensitive.
 465
 466     if( i_coefs == 64 )
 467     {
 468         const uint8_t *ctx_sig  = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
 469         const uint8_t *ctx_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
 470         for( i = 0; i < 63; i++ )
 471         {
 472             cabac_state_sig[i]  = ctx_sig[ significant_coeff_flag_offset_8x8[b_interlaced][i] ];
 473             cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
 474         }
 475     }
 476     else if( !dc || i_ctxBlockCat != DCT_CHROMA_DC )
 477     {
 478         memcpy( cabac_state_sig,  &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
 479         memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
 480     }
 481     else
 482     {
 483         memcpy( cabac_state_sig,  &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
 484         memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 3 );
 485     }
 486     memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
 487
 488     for( i = i_last_nnz; i >= b_ac; i-- )
 489     {
 490         int i_coef = abs_coefs[i];
 491         int q = ( f + i_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) ) >> 16;
 492         int abs_level;
 493         int cost_sig[2], cost_last[2];
 494         trellis_node_t n;
 495
 496         // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
 497         if( q == 0 )
 498         {
 499             // no need to calculate ssd of 0s: it's the same in all nodes.
 500             // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
 501             const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 )
 502                                      * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 503             for( j = 1; j < 8; j++ )
 504             {
 505                 if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
 506                 {
 507 #define SET_LEVEL(n,l) \
 508                     level_tree[i_levels_used].abs_level = l; \
 509                     level_tree[i_levels_used].next = n.level_idx; \
 510                     n.level_idx = i_levels_used; \
 511                     i_levels_used++;
 512
 513                     SET_LEVEL( nodes_cur[j], 0 );
 514                     nodes_cur[j].score += cost_sig0;
 515                 }
 516             }
 517             continue;
 518         }
 519
 520         XCHG( trellis_node_t*, nodes_cur, nodes_prev );
 521
 522         for( j = 0; j < 8; j++ )
 523             nodes_cur[j].score = TRELLIS_SCORE_MAX;
 524
 525         if( i < i_coefs-1 )
 526         {
 527             cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 0 );
 528             cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[i], 1 );
 529             cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 0 );
 530             cost_last[1] = x264_cabac_size_decision_noup2( &cabac_state_last[i], 1 );
 531         }
 532         else
 533         {
 534             cost_sig[0] = cost_sig[1] = 0;
 535             cost_last[0] = cost_last[1] = 0;
 536         }
 537
 538         // there are a few cases where increasing the coeff magnitude helps,
 539         // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
 540         // could also try q-2: that sometimes helps, but also sometimes decimates blocks
 541         // that are better left coded, especially at QP > 40.
 542         for( abs_level = q; abs_level >= q-1; abs_level-- )
 543         {
 544             int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);
 545             int d = i_coef - unquant_abs_level;
 546             int64_t ssd;
 547             /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */
 548             if( h->mb.i_psy_trellis && i && !dc && i_ctxBlockCat != DCT_CHROMA_AC )
 549             {
 550                 int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i];
 551                 int predicted_coef = orig_coef - i_coef * signs[i];
 552                 int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]);
 553                 int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]];
 554                 ssd = (int64_t)d*d * coef_weight[i] - psy_weight * psy_value;
 555             }
 556             else
 557             /* FIXME: for i16x16 dc is this weight optimal? */
 558                 ssd = (int64_t)d*d * (dc?256:coef_weight[i]);
 559
 560             for( j = 0; j < 8; j++ )
 561             {
 562                 int node_ctx = j;
 563                 if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
 564                     continue;
 565                 n = nodes_prev[j];
 566
 567                 /* code the proposed level, and count how much entropy it would take */
 568                 if( abs_level || node_ctx )
 569                 {
 570                     unsigned f8_bits = cost_sig[ abs_level != 0 ];
 571                     if( abs_level )
 572                     {
 573                         const int i_prefix = X264_MIN( abs_level - 1, 14 );
 574                         f8_bits += cost_last[ node_ctx == 0 ];
 575                         f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
 576                         if( i_prefix > 0 )
 577                         {
 578                             uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
 579                             f8_bits += cabac_size_unary[i_prefix][*ctx];
 580                             *ctx = cabac_transition_unary[i_prefix][*ctx];
 581                             if( abs_level >= 15 )
 582                                 f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
 583                             node_ctx = coeff_abs_level_transition[1][node_ctx];
 584                         }
 585                         else
 586                         {
 587                             f8_bits += 1 << CABAC_SIZE_BITS;
 588                             node_ctx = coeff_abs_level_transition[0][node_ctx];
 589                         }
 590                     }
 591                     n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 592                 }
 593
 594                 n.score += ssd;
 595
 596                 /* save the node if it's better than any existing node with the same cabac ctx */
 597                 if( n.score < nodes_cur[node_ctx].score )
 598                 {
 599                     SET_LEVEL( n, abs_level );
 600                     nodes_cur[node_ctx] = n;
 601                 }
 602             }
 603         }
 604     }
 605
 606     /* output levels from the best path through the trellis */
 607     bnode = &nodes_cur[0];
 608     for( j = 1; j < 8; j++ )
 609         if( nodes_cur[j].score < bnode->score )
 610             bnode = &nodes_cur[j];
 611
 612     j = bnode->level_idx;
 613     for( i = b_ac; i < i_coefs; i++ )
 614     {
 615         dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
 616         j = level_tree[j].next;
 617     }
 618 }
 619
 620 const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
 621
 622 void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat,
 623                             int i_qp, int i_ctxBlockCat, int b_intra )
 624 {
 625     quant_trellis_cabac( h, (int16_t*)dct,
 626         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
 627         NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced],
 628         i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 );
 629 }
 630
 631 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
 632                              int i_qp, int i_ctxBlockCat, int b_intra, int idx )
 633 {
 634     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC);
 635     quant_trellis_cabac( h, (int16_t*)dct,
 636         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
 637         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
 638         x264_zigzag_scan4[h->mb.b_interlaced],
 639         i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx );
 640 }
 641
 642 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
 643                              int i_qp, int b_intra, int idx )
 644 {
 645     quant_trellis_cabac( h, (int16_t*)dct,
 646         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
 647         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
 648         x264_zigzag_scan8[h->mb.b_interlaced],
 649         DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 0, 64, idx );
 650 }
 651