git.sesse.net Git - x264/blob - encoder/rdo.c

   1 /*****************************************************************************
   2  * rdo.c: h264 encoder library (rate-distortion optimization)
   3  *****************************************************************************
   4  * Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  19  *****************************************************************************/
  20
  21 /* duplicate all the writer functions, just calculating bit cost
  22  * instead of writing the bitstream.
  23  * TODO: use these for fast 1st pass too. */
  24
  25 #define RDO_SKIP_BS
  26
  27 static uint8_t cabac_prefix_transition[15][128];
  28 static uint16_t cabac_prefix_size[15][128];
  29
  30 /* CAVLC: produces exactly the same bit count as a normal encode */
  31 /* this probably still leaves some unnecessary computations */
  32 #define bs_write1(s,v)     ((s)->i_bits_encoded += 1)
  33 #define bs_write(s,n,v)    ((s)->i_bits_encoded += (n))
  34 #define bs_write_ue(s,v)   ((s)->i_bits_encoded += bs_size_ue(v))
  35 #define bs_write_se(s,v)   ((s)->i_bits_encoded += bs_size_se(v))
  36 #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l))
  37 #define x264_macroblock_write_cavlc  x264_macroblock_size_cavlc
  38 #include "cavlc.c"
  39
  40 /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of
  41  * fractional bits, but only finite precision. */
  42 #undef  x264_cabac_encode_decision
  43 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
  44 #define x264_cabac_encode_terminal(c)     x264_cabac_size_decision(c,276,0)
  45 #define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
  46 #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue(v+(1<<e)-1)-e)<<8)
  47 #define x264_cabac_encode_flush(h,c)
  48 #define x264_macroblock_write_cabac  x264_macroblock_size_cabac
  49 #include "cabac.c"
  50
  51 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
  52         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
  53
  54 static int ssd_mb( x264_t *h )
  55 {
  56     return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
  57                                      h->mb.pic.p_fdec[0], FDEC_STRIDE )
  58          + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[1], FENC_STRIDE,
  59                                      h->mb.pic.p_fdec[1], FDEC_STRIDE )
  60          + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[2], FENC_STRIDE,
  61                                      h->mb.pic.p_fdec[2], FDEC_STRIDE );
  62 }
  63
  64 static int ssd_plane( x264_t *h, int size, int p, int x, int y )
  65 {
  66     return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE,
  67                               h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE );
  68 }
  69
  70 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
  71 {
  72     int b_transform_bak = h->mb.b_transform_8x8;
  73     int i_ssd;
  74     int i_bits;
  75
  76     x264_macroblock_encode( h );
  77
  78     i_ssd = ssd_mb( h );
  79
  80     if( IS_SKIP( h->mb.i_type ) )
  81     {
  82         i_bits = (1 * i_lambda2 + 128) >> 8;
  83     }
  84     else if( h->param.b_cabac )
  85     {
  86         x264_cabac_t cabac_tmp;
  87         COPY_CABAC;
  88         x264_macroblock_size_cabac( h, &cabac_tmp );
  89         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
  90     }
  91     else
  92     {
  93         bs_t bs_tmp = h->out.bs;
  94         bs_tmp.i_bits_encoded = 0;
  95         x264_macroblock_size_cavlc( h, &bs_tmp );
  96         i_bits = ( bs_tmp.i_bits_encoded * i_lambda2 + 128 ) >> 8;
  97     }
  98
  99     h->mb.b_transform_8x8 = b_transform_bak;
 100
 101     return i_ssd + i_bits;
 102 }
 103
 104 int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
 105 {
 106     int i_ssd, i_bits;
 107
 108     if( i_pixel == PIXEL_16x16 )
 109     {
 110         int type_bak = h->mb.i_type;
 111         int i_cost = x264_rd_cost_mb( h, i_lambda2 );
 112         h->mb.i_type = type_bak;
 113         return i_cost;
 114     }
 115
 116     x264_macroblock_encode_p8x8( h, i8 );
 117     if( i_pixel == PIXEL_16x8 )
 118         x264_macroblock_encode_p8x8( h, i8+1 );
 119     if( i_pixel == PIXEL_8x16 )
 120         x264_macroblock_encode_p8x8( h, i8+2 );
 121
 122     i_ssd = ssd_plane( h, i_pixel,   0, (i8&1)*8, (i8>>1)*8 )
 123           + ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
 124           + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
 125
 126     if( h->param.b_cabac )
 127     {
 128         x264_cabac_t cabac_tmp;
 129         COPY_CABAC;
 130         x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
 131         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
 132     }
 133     else
 134     {
 135         i_bits = ( x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2 + 128 ) >> 8;
 136     }
 137
 138     return i_ssd + i_bits;
 139 }
 140
 141 int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
 142 {
 143     int i_ssd, i_bits;
 144
 145     x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
 146     i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 );
 147
 148     if( h->param.b_cabac )
 149     {
 150         x264_cabac_t cabac_tmp;
 151         COPY_CABAC;
 152         x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
 153         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
 154     }
 155     else
 156     {
 157         i_bits = ( x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2 + 128 ) >> 8;
 158     }
 159
 160     return i_ssd + i_bits;
 161 }
 162
 163 int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
 164 {
 165     int i_ssd, i_bits;
 166
 167     x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
 168     i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 );
 169
 170     if( h->param.b_cabac )
 171     {
 172         x264_cabac_t cabac_tmp;
 173         COPY_CABAC;
 174         x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
 175         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
 176     }
 177     else
 178     {
 179         i_bits = ( x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2 + 128 ) >> 8;
 180     }
 181
 182     return i_ssd + i_bits;
 183 }
 184
 185 int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
 186 {
 187     int i_ssd, i_bits;
 188
 189     if( b_dct )
 190         x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
 191     i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
 192             ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
 193
 194     h->mb.i_chroma_pred_mode = i_mode;
 195
 196     if( h->param.b_cabac )
 197     {
 198         x264_cabac_t cabac_tmp;
 199         COPY_CABAC;
 200         x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
 201         i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
 202     }
 203     else
 204     {
 205         i_bits = ( x264_i8x8_chroma_size_cavlc( h ) * i_lambda2 + 128 ) >> 8;
 206     }
 207
 208     return i_ssd + i_bits;
 209 }
 210 /****************************************************************************
 211  * Trellis RD quantization
 212  ****************************************************************************/
 213
 214 #define TRELLIS_SCORE_MAX ((uint64_t)1<<50)
 215 #define CABAC_SIZE_BITS 8
 216 #define SSD_WEIGHT_BITS 5
 217 #define LAMBDA_BITS 4
 218
 219 /* precalculate the cost of coding abs_level_m1 */
 220 void x264_rdo_init( )
 221 {
 222     int i_prefix;
 223     int i_ctx;
 224     for( i_prefix = 0; i_prefix < 15; i_prefix++ )
 225     {
 226         for( i_ctx = 0; i_ctx < 128; i_ctx++ )
 227         {
 228             int f8_bits = 0;
 229             uint8_t ctx = i_ctx;
 230             int i;
 231
 232             for( i = 1; i < i_prefix; i++ )
 233                 f8_bits += x264_cabac_size_decision2( &ctx, 1 );
 234             if( i_prefix > 0 && i_prefix < 14 )
 235                 f8_bits += x264_cabac_size_decision2( &ctx, 0 );
 236             f8_bits += 1 << CABAC_SIZE_BITS; //sign
 237
 238             cabac_prefix_size[i_prefix][i_ctx] = f8_bits;
 239             cabac_prefix_transition[i_prefix][i_ctx] = ctx;
 240         }
 241     }
 242 }
 243
 244 // should the intra and inter lambdas be different?
 245 // I'm just matching the behaviour of deadzone quant.
 246 static const int lambda2_tab[2][52] = {
 247     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
 248     {    46,      58,      73,      92,     117,     147,
 249         185,     233,     294,     370,     466,     587,
 250         740,     932,    1174,    1480,    1864,    2349,
 251        2959,    3728,    4697,    5918,    7457,    9395,
 252       11837,   14914,   18790,   23674,   29828,   37581,
 253       47349,   59656,   75163,   94699,  119313,  150326,
 254      189399,  238627,  300652,  378798,  477255,  601304,
 255      757596,  954511, 1202608, 1515192, 1909022, 2405217,
 256     3030384, 3818045, 4810435, 6060769 },
 257     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
 258     {    27,      34,      43,      54,      68,      86,
 259         108,     136,     172,     216,     273,     343,
 260         433,     545,     687,     865,    1090,    1374,
 261        1731,    2180,    2747,    3461,    4361,    5494,
 262        6922,    8721,   10988,   13844,   17442,   21976,
 263       27688,   34885,   43953,   55377,   69771,   87906,
 264      110755,  139543,  175813,  221511,  279087,  351627,
 265      443023,  558174,  703255,  886046, 1116348, 1406511,
 266     1772093, 2232697, 2813022, 3544186 }
 267 };
 268
 269 typedef struct {
 270     uint64_t score;
 271     int level_idx; // index into level_tree[]
 272     uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1
 273 } trellis_node_t;
 274
 275 // TODO:
 276 // support chroma and i16x16 DC
 277 // save cabac state between blocks?
 278 // use trellis' RD score instead of x264_mb_decimate_score?
 279 // code 8x8 sig/last flags forwards with deadzone and save the contexts at
 280 //   each position?
 281 // change weights when using CQMs?
 282
 283 // possible optimizations:
 284 // make scores fit in 32bit
 285 // save quantized coefs during rd, to avoid a duplicate trellis in the final encode
 286 // if trellissing all MBRD modes, finish SSD calculation so we can skip all of
 287 //   the normal dequant/idct/ssd/cabac
 288
 289 // the unquant_mf here is not the same as dequant_mf:
 290 // in normal operation (dct->quant->dequant->idct) the dct and idct are not
 291 // normalized. quant/dequant absorb those scaling factors.
 292 // in this function, we just do (quant->unquant) and want the output to be
 293 // comparable to the input. so unquant is the direct inverse of quant,
 294 // and uses the dct scaling factors, not the idct ones.
 295
 296 static inline void quant_trellis_cabac( x264_t *h, int16_t *dct,
 297                                  const uint16_t *quant_mf, const int *unquant_mf,
 298                                  const int *coef_weight, const uint8_t *zigzag,
 299                                  int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs )
 300 {
 301     int abs_coefs[64], signs[64];
 302     trellis_node_t nodes[2][8];
 303     trellis_node_t *nodes_cur = nodes[0];
 304     trellis_node_t *nodes_prev = nodes[1];
 305     trellis_node_t *bnode;
 306     uint8_t cabac_state_sig[64];
 307     uint8_t cabac_state_last[64];
 308     const int b_interlaced = h->mb.b_interlaced;
 309     const int f = 1 << 15; // no deadzone
 310     int i_last_nnz;
 311     int i, j;
 312
 313     // (# of coefs) * (# of ctx) * (# of levels tried) = 1024
 314     // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough,
 315     // but it takes more time to remove dead states than you gain in reduced memory.
 316     struct {
 317         uint16_t abs_level;
 318         uint16_t next;
 319     } level_tree[64*8*2];
 320     int i_levels_used = 1;
 321
 322     /* init coefs */
 323     for( i = i_coefs-1; i >= b_ac; i-- )
 324         if( (unsigned)(dct[zigzag[i]] * quant_mf[zigzag[i]] + f-1) >= 2*f )
 325             break;
 326
 327     if( i < b_ac )
 328     {
 329         memset( dct, 0, i_coefs * sizeof(*dct) );
 330         return;
 331     }
 332
 333     i_last_nnz = i;
 334
 335     for( ; i >= b_ac; i-- )
 336     {
 337         int coef = dct[zigzag[i]];
 338         abs_coefs[i] = abs(coef);
 339         signs[i] = coef < 0 ? -1 : 1;
 340     }
 341
 342     /* init trellis */
 343     for( i = 1; i < 8; i++ )
 344         nodes_cur[i].score = TRELLIS_SCORE_MAX;
 345     nodes_cur[0].score = 0;
 346     nodes_cur[0].level_idx = 0;
 347     level_tree[0].abs_level = 0;
 348     level_tree[0].next = 0;
 349
 350     // coefs are processed in reverse order, because that's how the abs value is coded.
 351     // last_coef and significant_coef flags are normally coded in forward order, but
 352     // we have to reverse them to match the levels.
 353     // in 4x4 blocks, last_coef and significant_coef use a separate context for each
 354     // position, so the order doesn't matter, and we don't even have to update their contexts.
 355     // in 8x8 blocks, some positions share contexts, so we'll just have to hope that
 356     // cabac isn't too sensitive.
 357
 358     if( i_coefs == 64 )
 359     {
 360         const uint8_t *ctx_sig  = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
 361         const uint8_t *ctx_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ];
 362         for( i = 0; i < 63; i++ )
 363         {
 364             cabac_state_sig[i]  = ctx_sig[ significant_coeff_flag_offset_8x8[b_interlaced][i] ];
 365             cabac_state_last[i] = ctx_last[ last_coeff_flag_offset_8x8[i] ];
 366         }
 367     }
 368     else
 369     {
 370         memcpy( cabac_state_sig,  &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
 371         memcpy( cabac_state_last, &h->cabac.state[ last_coeff_flag_offset[b_interlaced][i_ctxBlockCat] ], 15 );
 372     }
 373     memcpy( nodes_cur[0].cabac_state, &h->cabac.state[ coeff_abs_level_m1_offset[i_ctxBlockCat] ], 10 );
 374
 375     for( i = i_last_nnz; i >= b_ac; i-- )
 376     {
 377         int i_coef = abs_coefs[i];
 378         int q = ( f + i_coef * quant_mf[zigzag[i]] ) >> 16;
 379         int abs_level;
 380         int cost_sig[2], cost_last[2];
 381         trellis_node_t n;
 382
 383         // skip 0s: this doesn't affect the output, but saves some unnecessary computation.
 384         if( q == 0 )
 385         {
 386             // no need to calculate ssd of 0s: it's the same in all nodes.
 387             // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
 388             const uint32_t cost_sig0 = x264_cabac_size_decision_noup( &cabac_state_sig[i], 0 )
 389                                      * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 390             for( j = 1; j < 8; j++ )
 391             {
 392                 if( nodes_cur[j].score != TRELLIS_SCORE_MAX )
 393                 {
 394 #define SET_LEVEL(n,l) \
 395                     level_tree[i_levels_used].abs_level = l; \
 396                     level_tree[i_levels_used].next = n.level_idx; \
 397                     n.level_idx = i_levels_used; \
 398                     i_levels_used++;
 399
 400                     SET_LEVEL( nodes_cur[j], 0 );
 401                     nodes_cur[j].score += cost_sig0;
 402                 }
 403             }
 404             continue;
 405         }
 406
 407         XCHG( trellis_node_t*, nodes_cur, nodes_prev );
 408
 409         for( j = 0; j < 8; j++ )
 410             nodes_cur[j].score = TRELLIS_SCORE_MAX;
 411
 412         if( i < i_coefs-1 )
 413         {
 414             cost_sig[0] = x264_cabac_size_decision_noup( &cabac_state_sig[i], 0 );
 415             cost_sig[1] = x264_cabac_size_decision_noup( &cabac_state_sig[i], 1 );
 416             cost_last[0] = x264_cabac_size_decision_noup( &cabac_state_last[i], 0 );
 417             cost_last[1] = x264_cabac_size_decision_noup( &cabac_state_last[i], 1 );
 418         }
 419         else
 420         {
 421             cost_sig[0] = cost_sig[1] = 0;
 422             cost_last[0] = cost_last[1] = 0;
 423         }
 424
 425         // there are a few cases where increasing the coeff magnitude helps,
 426         // but it's only around .003 dB, and skipping them ~doubles the speed of trellis.
 427         // could also try q-2: that sometimes helps, but also sometimes decimates blocks
 428         // that are better left coded, especially at QP > 40.
 429         for( abs_level = q; abs_level >= q-1; abs_level-- )
 430         {
 431             int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
 432             uint64_t ssd = (int64_t)d*d * coef_weight[i];
 433
 434             for( j = 0; j < 8; j++ )
 435             {
 436                 int node_ctx = j;
 437                 if( nodes_prev[j].score == TRELLIS_SCORE_MAX )
 438                     continue;
 439                 n = nodes_prev[j];
 440
 441                 /* code the proposed level, and count how much entropy it would take */
 442                 if( abs_level || node_ctx )
 443                 {
 444                     unsigned f8_bits = cost_sig[ abs_level != 0 ];
 445                     if( abs_level )
 446                     {
 447                         const int i_prefix = X264_MIN( abs_level - 1, 14 );
 448                         f8_bits += cost_last[ node_ctx == 0 ];
 449                         f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
 450                         if( i_prefix > 0 )
 451                         {
 452                             uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
 453                             f8_bits += cabac_prefix_size[i_prefix][*ctx];
 454                             *ctx = cabac_prefix_transition[i_prefix][*ctx];
 455                             if( abs_level >= 15 )
 456                                 f8_bits += bs_size_ue( abs_level - 15 ) << CABAC_SIZE_BITS;
 457                             node_ctx = coeff_abs_level_transition[1][node_ctx];
 458                         }
 459                         else
 460                         {
 461                             f8_bits += 1 << CABAC_SIZE_BITS;
 462                             node_ctx = coeff_abs_level_transition[0][node_ctx];
 463                         }
 464                     }
 465                     n.score += (uint64_t)f8_bits * i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
 466                 }
 467
 468                 n.score += ssd;
 469
 470                 /* save the node if it's better than any existing node with the same cabac ctx */
 471                 if( n.score < nodes_cur[node_ctx].score )
 472                 {
 473                     SET_LEVEL( n, abs_level );
 474                     nodes_cur[node_ctx] = n;
 475                 }
 476             }
 477         }
 478     }
 479
 480     /* output levels from the best path through the trellis */
 481     bnode = &nodes_cur[0];
 482     for( j = 1; j < 8; j++ )
 483         if( nodes_cur[j].score < bnode->score )
 484             bnode = &nodes_cur[j];
 485
 486     j = bnode->level_idx;
 487     for( i = b_ac; i < i_coefs; i++ )
 488     {
 489         dct[zigzag[i]] = level_tree[j].abs_level * signs[i];
 490         j = level_tree[j].next;
 491     }
 492 }
 493
 494
 495 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
 496                              int i_qp, int i_ctxBlockCat, int b_intra )
 497 {
 498     int b_ac = (i_ctxBlockCat == DCT_LUMA_AC);
 499     quant_trellis_cabac( h, (int16_t*)dct,
 500         h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
 501         x264_dct4_weight2_zigzag[h->mb.b_interlaced],
 502         x264_zigzag_scan4[h->mb.b_interlaced],
 503         i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 );
 504 }
 505
 506
 507 void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
 508                              int i_qp, int b_intra )
 509 {
 510     quant_trellis_cabac( h, (int16_t*)dct,
 511         h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp],
 512         x264_dct8_weight2_zigzag[h->mb.b_interlaced],
 513         x264_zigzag_scan8[h->mb.b_interlaced],
 514         DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 );
 515 }
 516