git.sesse.net Git - x264/blob - encoder/macroblock.c

   1 /*****************************************************************************
   2  * macroblock.c: macroblock encoding
   3  *****************************************************************************
   4  * Copyright (C) 2003-2015 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *          Henrik Gramner <henrik@gramner.com>
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  24  *
  25  * This program is also available under a commercial proprietary license.
  26  * For more information, contact us at licensing@x264.com.
  27  *****************************************************************************/
  28
  29 #include "common/common.h"
  30 #include "macroblock.h"
  31
  32 /* These chroma DC functions don't have assembly versions and are only used here. */
  33
  34 #define ZIG(i,y,x) level[i] = dct[x*2+y];
  35 static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
  36 {
  37     ZIG(0,0,0)
  38     ZIG(1,0,1)
  39     ZIG(2,1,0)
  40     ZIG(3,1,1)
  41 }
  42 #undef ZIG
  43
  44 static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
  45 {
  46     level[0] = dct[0];
  47     level[1] = dct[2];
  48     level[2] = dct[1];
  49     level[3] = dct[4];
  50     level[4] = dct[6];
  51     level[5] = dct[3];
  52     level[6] = dct[5];
  53     level[7] = dct[7];
  54 }
  55
  56 #define IDCT_DEQUANT_2X2_START \
  57     int d0 = dct[0] + dct[1]; \
  58     int d1 = dct[2] + dct[3]; \
  59     int d2 = dct[0] - dct[1]; \
  60     int d3 = dct[2] - dct[3]; \
  61     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  62
  63 static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
  64 {
  65     IDCT_DEQUANT_2X2_START
  66     dct4x4[0][0] = (d0 + d1) * dmf >> 5;
  67     dct4x4[1][0] = (d0 - d1) * dmf >> 5;
  68     dct4x4[2][0] = (d2 + d3) * dmf >> 5;
  69     dct4x4[3][0] = (d2 - d3) * dmf >> 5;
  70 }
  71
  72 static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
  73 {
  74     IDCT_DEQUANT_2X2_START
  75     dct[0] = (d0 + d1) * dmf >> 5;
  76     dct[1] = (d0 - d1) * dmf >> 5;
  77     dct[2] = (d2 + d3) * dmf >> 5;
  78     dct[3] = (d2 - d3) * dmf >> 5;
  79 }
  80 #undef IDCT_2X2_DEQUANT_START
  81
  82 static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
  83 {
  84     int d0 = dct4x4[0][0] + dct4x4[1][0];
  85     int d1 = dct4x4[2][0] + dct4x4[3][0];
  86     int d2 = dct4x4[0][0] - dct4x4[1][0];
  87     int d3 = dct4x4[2][0] - dct4x4[3][0];
  88     d[0] = d0 + d1;
  89     d[2] = d2 + d3;
  90     d[1] = d0 - d1;
  91     d[3] = d2 - d3;
  92     dct4x4[0][0] = 0;
  93     dct4x4[1][0] = 0;
  94     dct4x4[2][0] = 0;
  95     dct4x4[3][0] = 0;
  96 }
  97
  98 static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
  99 {
 100     if( WORD_SIZE == 8 )
 101     {
 102         for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
 103             if( M64( &v[i] ) )
 104                 return 1;
 105     }
 106     else
 107     {
 108         for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
 109             if( M32( &v[i] ) )
 110                 return 1;
 111     }
 112     return 0;
 113 }
 114
 115 /* All encoding functions must output the correct CBP and NNZ values.
 116  * The entropy coding functions will check CBP first, then NNZ, before
 117  * actually reading the DCT coefficients.  NNZ still must be correct even
 118  * if CBP is zero because of the use of NNZ values for context selection.
 119  * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 120  * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 121  * coding and stored as necessary. */
 122
 123 /* This means that decimation can be done merely by adjusting the CBP and NNZ
 124  * rather than memsetting the coefficients. */
 125
 126 static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
 127 {
 128     pixel *p_src = h->mb.pic.p_fenc[p];
 129     pixel *p_dst = h->mb.pic.p_fdec[p];
 130
 131     ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
 132     ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
 133
 134     int nz, block_cbp = 0;
 135     int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
 136     int i_quant_cat = p ? CQM_4IC : CQM_4IY;
 137     int i_mode = h->mb.i_intra16x16_pred_mode;
 138
 139     if( h->mb.b_lossless )
 140         x264_predict_lossless_16x16( h, p, i_mode );
 141     else
 142         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
 143
 144     if( h->mb.b_lossless )
 145     {
 146         for( int i = 0; i < 16; i++ )
 147         {
 148             int oe = block_idx_xy_fenc[i];
 149             int od = block_idx_xy_fdec[i];
 150             nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
 151             h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
 152             block_cbp |= nz;
 153         }
 154         h->mb.i_cbp_luma |= block_cbp * 0xf;
 155         h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
 156         h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
 157         return;
 158     }
 159
 160     CLEAR_16x16_NNZ( p );
 161
 162     h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
 163
 164     if( h->mb.b_noise_reduction )
 165         for( int idx = 0; idx < 16; idx++ )
 166             h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
 167
 168     for( int idx = 0; idx < 16; idx++ )
 169     {
 170         dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
 171         dct4x4[idx][0] = 0;
 172     }
 173
 174     if( h->mb.b_trellis )
 175     {
 176         for( int idx = 0; idx < 16; idx++ )
 177             if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
 178             {
 179                 block_cbp = 0xf;
 180                 h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
 181                 h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
 182                 if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
 183                 h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
 184             }
 185     }
 186     else
 187     {
 188         for( int i8x8 = 0; i8x8 < 4; i8x8++ )
 189         {
 190             nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
 191             if( nz )
 192             {
 193                 block_cbp = 0xf;
 194                 FOREACH_BIT( idx, i8x8*4, nz )
 195                 {
 196                     h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
 197                     h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
 198                     if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
 199                     h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
 200                 }
 201             }
 202         }
 203     }
 204
 205     /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
 206     /* More useful with CAVLC, but still useful with CABAC. */
 207     if( decimate_score < 6 )
 208     {
 209         CLEAR_16x16_NNZ( p );
 210         block_cbp = 0;
 211     }
 212     else
 213         h->mb.i_cbp_luma |= block_cbp;
 214
 215     h->dctf.dct4x4dc( dct_dc4x4 );
 216     if( h->mb.b_trellis )
 217         nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
 218     else
 219         nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
 220
 221     h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
 222     if( nz )
 223     {
 224         h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
 225
 226         /* output samples to fdec */
 227         h->dctf.idct4x4dc( dct_dc4x4 );
 228         h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp );  /* XXX not inversed */
 229         if( block_cbp )
 230             for( int i = 0; i < 16; i++ )
 231                 dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
 232     }
 233
 234     /* put pixels to fdec */
 235     if( block_cbp )
 236         h->dctf.add16x16_idct( p_dst, dct4x4 );
 237     else if( nz )
 238         h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
 239 }
 240
 241 /* Round down coefficients losslessly in DC-only chroma blocks.
 242  * Unlike luma blocks, this can't be done with a lookup table or
 243  * other shortcut technique because of the interdependencies
 244  * between the coefficients due to the chroma DC transform. */
 245 static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
 246 {
 247     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
 248
 249     /* If the QP is too high, there's no benefit to rounding optimization. */
 250     if( dmf > 32*64 )
 251         return 1;
 252
 253     if( chroma422 )
 254         return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
 255     else
 256         return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
 257 }
 258
 259 static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
 260 {
 261     int nz, nz_dc;
 262     int b_decimate = b_inter && h->mb.b_dct_decimate;
 263     int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
 264     ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
 265     h->mb.i_cbp_chroma = 0;
 266     h->nr_count[2] += h->mb.b_noise_reduction * 4;
 267
 268     M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
 269     M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
 270     M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
 271     M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
 272     if( chroma422 )
 273     {
 274         M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
 275         M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
 276         M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
 277         M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
 278     }
 279
 280     /* Early termination: check variance of chroma residual before encoding.
 281      * Don't bother trying early termination at low QPs.
 282      * Values are experimentally derived. */
 283     if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
 284     {
 285         int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
 286         int ssd[2];
 287         int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
 288
 289         int score  = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
 290         if( score < thresh*4 )
 291             score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
 292         if( score < thresh*4 )
 293         {
 294             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
 295             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
 296
 297             for( int ch = 0; ch < 2; ch++ )
 298             {
 299                 if( ssd[ch] > thresh )
 300                 {
 301                     pixel *p_src = h->mb.pic.p_fenc[1+ch];
 302                     pixel *p_dst = h->mb.pic.p_fdec[1+ch];
 303
 304                     if( chroma422 )
 305                         /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
 306                         h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
 307                     else
 308                         h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
 309
 310                     if( h->mb.b_trellis )
 311                         nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
 312                     else
 313                     {
 314                         nz_dc = 0;
 315                         for( int i = 0; i <= chroma422; i++ )
 316                             nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
 317                                                              h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
 318                     }
 319
 320                     if( nz_dc )
 321                     {
 322                         if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
 323                             continue;
 324                         h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
 325                         if( chroma422 )
 326                         {
 327                             zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
 328                             h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
 329                         }
 330                         else
 331                         {
 332                             zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
 333                             idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
 334                         }
 335
 336                         for( int i = 0; i <= chroma422; i++ )
 337                             h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
 338                         h->mb.i_cbp_chroma = 1;
 339                     }
 340                 }
 341             }
 342             return;
 343         }
 344     }
 345
 346     for( int ch = 0; ch < 2; ch++ )
 347     {
 348         pixel *p_src = h->mb.pic.p_fenc[1+ch];
 349         pixel *p_dst = h->mb.pic.p_fdec[1+ch];
 350         int i_decimate_score = b_decimate ? 0 : 7;
 351         int nz_ac = 0;
 352
 353         ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
 354
 355         if( h->mb.b_lossless )
 356         {
 357             static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };
 358
 359             for( int i = 0; i < (chroma422?8:4); i++ )
 360             {
 361                 int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
 362                 int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
 363                 nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
 364                                            &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
 365                 h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
 366                 h->mb.i_cbp_chroma |= nz;
 367             }
 368             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
 369             continue;
 370         }
 371
 372         for( int i = 0; i <= chroma422; i++ )
 373             h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
 374
 375         if( h->mb.b_noise_reduction )
 376             for( int i = 0; i < (chroma422?8:4); i++ )
 377                 h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );
 378
 379         if( chroma422 )
 380             h->dctf.dct2x4dc( dct_dc, dct4x4 );
 381         else
 382             dct2x2dc( dct_dc, dct4x4 );
 383
 384         /* calculate dct coeffs */
 385         for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ )
 386         {
 387             if( h->mb.b_trellis )
 388             {
 389                 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
 390                 {
 391                     if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) )
 392                     {
 393                         int idx = 16+ch*16+i8x8*8+i4x4;
 394                         h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
 395                         h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
 396                         if( i_decimate_score < 7 )
 397                             i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
 398                         h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
 399                         nz_ac = 1;
 400                     }
 401                 }
 402             }
 403             else
 404             {
 405                 nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp],
 406                                             h->quant4_bias[CQM_4IC+b_inter][i_qp] );
 407                 nz_ac |= nz;
 408
 409                 FOREACH_BIT( i4x4, 0, nz )
 410                 {
 411                     int idx = 16+ch*16+i8x8*8+i4x4;
 412
 413                     h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
 414                     h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
 415                     if( i_decimate_score < 7 )
 416                         i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
 417                     h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
 418                 }
 419             }
 420         }
 421
 422         if( h->mb.b_trellis )
 423             nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
 424         else
 425         {
 426             nz_dc = 0;
 427             for( int i = 0; i <= chroma422; i++ )
 428                 nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
 429                                                  h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
 430         }
 431
 432         h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
 433
 434         if( i_decimate_score < 7 || !nz_ac )
 435         {
 436             /* Decimate the block */
 437             M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
 438             M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
 439             if( chroma422 )
 440             {
 441                 M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
 442                 M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
 443             }
 444
 445             if( !nz_dc ) /* Whole block is empty */
 446                 continue;
 447             if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
 448             {
 449                 h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
 450                 continue;
 451             }
 452             /* DC-only */
 453             if( chroma422 )
 454             {
 455                 zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
 456                 h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
 457             }
 458             else
 459             {
 460                 zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
 461                 idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
 462             }
 463
 464             for( int i = 0; i <= chroma422; i++ )
 465                 h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
 466         }
 467         else
 468         {
 469             h->mb.i_cbp_chroma = 1;
 470
 471             if( nz_dc )
 472             {
 473                 if( chroma422 )
 474                 {
 475                     zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
 476                     h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
 477                 }
 478                 else
 479                 {
 480                     zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
 481                     idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
 482                 }
 483             }
 484
 485             for( int i = 0; i <= chroma422; i++ )
 486                 h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
 487         }
 488     }
 489
 490     /* 0 = none, 1 = DC only, 2 = DC+AC */
 491     h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
 492                            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
 493 }
 494
 495 void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
 496 {
 497     if( CHROMA_FORMAT == CHROMA_420 )
 498         x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
 499     else
 500         x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
 501 }
 502
 503 static void x264_macroblock_encode_skip( x264_t *h )
 504 {
 505     M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
 506     M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
 507     M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
 508     M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
 509     M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
 510     M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
 511     M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
 512     M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
 513     if( CHROMA_FORMAT >= CHROMA_422 )
 514     {
 515         M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
 516         M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
 517         M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
 518         M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
 519     }
 520     h->mb.i_cbp_luma = 0;
 521     h->mb.i_cbp_chroma = 0;
 522     h->mb.cbp[h->mb.i_mb_xy] = 0;
 523 }
 524
 525 /*****************************************************************************
 526  * Intra prediction for predictive lossless mode.
 527  *****************************************************************************/
 528
 529 void x264_predict_lossless_chroma( x264_t *h, int i_mode )
 530 {
 531     int height = 16 >> CHROMA_V_SHIFT;
 532     if( i_mode == I_PRED_CHROMA_V )
 533     {
 534         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
 535         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
 536         memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
 537         memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
 538     }
 539     else if( i_mode == I_PRED_CHROMA_H )
 540     {
 541         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
 542         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
 543         x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
 544         x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
 545         if( CHROMA_FORMAT == CHROMA_422 )
 546         {
 547             x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
 548             x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
 549         }
 550     }
 551     else
 552     {
 553         h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
 554         h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
 555     }
 556 }
 557
 558 void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
 559 {
 560     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
 561     pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
 562
 563     if( i_mode == I_PRED_4x4_V )
 564         h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
 565     else if( i_mode == I_PRED_4x4_H )
 566         h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
 567     else
 568         h->predict_4x4[i_mode]( p_dst );
 569 }
 570
 571 void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
 572 {
 573     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
 574     pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
 575
 576     if( i_mode == I_PRED_8x8_V )
 577         h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
 578     else if( i_mode == I_PRED_8x8_H )
 579         h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
 580     else
 581         h->predict_8x8[i_mode]( p_dst, edge );
 582 }
 583
 584 void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
 585 {
 586     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
 587     if( i_mode == I_PRED_16x16_V )
 588         h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
 589     else if( i_mode == I_PRED_16x16_H )
 590         h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
 591     else
 592         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
 593 }
 594
 595 /*****************************************************************************
 596  * x264_macroblock_encode:
 597  *****************************************************************************/
 598 static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
 599 {
 600     int i_qp = h->mb.i_qp;
 601     int b_decimate = h->mb.b_dct_decimate;
 602     int b_force_no_skip = 0;
 603     int nz;
 604     h->mb.i_cbp_luma = 0;
 605     for( int p = 0; p < plane_count; p++ )
 606         h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
 607
 608     if( h->mb.i_type == I_PCM )
 609     {
 610         /* if PCM is chosen, we need to store reconstructed frame data */
 611         for( int p = 0; p < plane_count; p++ )
 612             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
 613         if( chroma )
 614         {
 615             int height = 16 >> CHROMA_V_SHIFT;
 616             h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
 617             h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
 618         }
 619         return;
 620     }
 621
 622     if( !h->mb.b_allow_skip )
 623     {
 624         b_force_no_skip = 1;
 625         if( IS_SKIP(h->mb.i_type) )
 626         {
 627             if( h->mb.i_type == P_SKIP )
 628                 h->mb.i_type = P_L0;
 629             else if( h->mb.i_type == B_SKIP )
 630                 h->mb.i_type = B_DIRECT;
 631         }
 632     }
 633
 634     if( h->mb.i_type == P_SKIP )
 635     {
 636         /* don't do pskip motion compensation if it was already done in macroblock_analyse */
 637         if( !h->mb.b_skip_mc )
 638         {
 639             int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
 640                                   h->mb.mv_min[0], h->mb.mv_max[0] );
 641             int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
 642                                   h->mb.mv_min[1], h->mb.mv_max[1] );
 643
 644             for( int p = 0; p < plane_count; p++ )
 645                 h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
 646                                &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
 647                                mvx, mvy, 16, 16, &h->sh.weight[0][p] );
 648
 649             if( chroma )
 650             {
 651                 int v_shift = CHROMA_V_SHIFT;
 652                 int height = 16 >> v_shift;
 653
 654                 /* Special case for mv0, which is (of course) very common in P-skip mode. */
 655                 if( mvx | mvy )
 656                     h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
 657                                      h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
 658                                      mvx, 2*mvy>>v_shift, 8, height );
 659                 else
 660                     h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
 661                                                          h->mb.pic.i_stride[1], height );
 662
 663                 if( h->sh.weight[0][1].weightfn )
 664                     h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
 665                                                        h->mb.pic.p_fdec[1], FDEC_STRIDE,
 666                                                        &h->sh.weight[0][1], height );
 667                 if( h->sh.weight[0][2].weightfn )
 668                     h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
 669                                                        h->mb.pic.p_fdec[2], FDEC_STRIDE,
 670                                                        &h->sh.weight[0][2], height );
 671             }
 672         }
 673
 674         x264_macroblock_encode_skip( h );
 675         return;
 676     }
 677     if( h->mb.i_type == B_SKIP )
 678     {
 679         /* don't do bskip motion compensation if it was already done in macroblock_analyse */
 680         if( !h->mb.b_skip_mc )
 681             x264_mb_mc( h );
 682         x264_macroblock_encode_skip( h );
 683         return;
 684     }
 685
 686     if( h->mb.i_type == I_16x16 )
 687     {
 688         h->mb.b_transform_8x8 = 0;
 689
 690         for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
 691             x264_mb_encode_i16x16( h, p, i_qp );
 692     }
 693     else if( h->mb.i_type == I_8x8 )
 694     {
 695         h->mb.b_transform_8x8 = 1;
 696         /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
 697         if( h->mb.i_skip_intra )
 698         {
 699             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
 700             M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
 701             M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
 702             M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
 703             M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
 704             h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
 705             /* In RD mode, restore the now-overwritten DCT data. */
 706             if( h->mb.i_skip_intra == 2 )
 707                 h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
 708         }
 709         for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
 710         {
 711             for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
 712             {
 713                 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
 714                 x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
 715             }
 716         }
 717     }
 718     else if( h->mb.i_type == I_4x4 )
 719     {
 720         h->mb.b_transform_8x8 = 0;
 721         /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
 722         if( h->mb.i_skip_intra )
 723         {
 724             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
 725             M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
 726             M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
 727             M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
 728             M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
 729             h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
 730             /* In RD mode, restore the now-overwritten DCT data. */
 731             if( h->mb.i_skip_intra == 2 )
 732                 h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
 733         }
 734         for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
 735         {
 736             for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
 737             {
 738                 pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
 739                 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 740
 741                 if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 742                     /* emulate missing topright samples */
 743                     MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
 744
 745                 x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
 746             }
 747         }
 748     }
 749     else    /* Inter MB */
 750     {
 751         int i_decimate_mb = 0;
 752
 753         /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
 754         if( !h->mb.b_skip_mc )
 755             x264_mb_mc( h );
 756
 757         if( h->mb.b_lossless )
 758         {
 759             if( h->mb.b_transform_8x8 )
 760                 for( int p = 0; p < plane_count; p++ )
 761                     for( int i8x8 = 0; i8x8 < 4; i8x8++ )
 762                     {
 763                         int x = i8x8&1;
 764                         int y = i8x8>>1;
 765                         nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
 766                                                                            h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
 767                         STORE_8x8_NNZ( p, i8x8, nz );
 768                         h->mb.i_cbp_luma |= nz << i8x8;
 769                     }
 770             else
 771                 for( int p = 0; p < plane_count; p++ )
 772                     for( int i4x4 = 0; i4x4 < 16; i4x4++ )
 773                     {
 774                         nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
 775                                                  h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
 776                                                  h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
 777                         h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
 778                         h->mb.i_cbp_luma |= nz << (i4x4>>2);
 779                     }
 780         }
 781         else if( h->mb.b_transform_8x8 )
 782         {
 783             ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
 784             b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 785
 786             for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
 787             {
 788                 CLEAR_16x16_NNZ( p );
 789                 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
 790                 h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
 791
 792                 int plane_cbp = 0;
 793                 for( int idx = 0; idx < 4; idx++ )
 794                 {
 795                     nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );
 796
 797                     if( nz )
 798                     {
 799                         h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
 800                         if( b_decimate )
 801                         {
 802                             int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
 803                             i_decimate_mb += i_decimate_8x8;
 804                             if( i_decimate_8x8 >= 4 )
 805                                 plane_cbp |= 1<<idx;
 806                         }
 807                         else
 808                             plane_cbp |= 1<<idx;
 809                     }
 810                 }
 811
 812                 if( i_decimate_mb >= 6 || !b_decimate )
 813                 {
 814                     h->mb.i_cbp_luma |= plane_cbp;
 815                     FOREACH_BIT( idx, 0, plane_cbp )
 816                     {
 817                         h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
 818                         h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
 819                         STORE_8x8_NNZ( p, idx, 1 );
 820                     }
 821                 }
 822             }
 823         }
 824         else
 825         {
 826             ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
 827             for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
 828             {
 829                 CLEAR_16x16_NNZ( p );
 830                 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
 831
 832                 if( h->mb.b_noise_reduction )
 833                 {
 834                     h->nr_count[0+!!p*2] += 16;
 835                     for( int idx = 0; idx < 16; idx++ )
 836                         h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
 837                 }
 838
 839                 int plane_cbp = 0;
 840                 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
 841                 {
 842                     int i_decimate_8x8 = b_decimate ? 0 : 6;
 843                     int nnz8x8 = 0;
 844                     if( h->mb.b_trellis )
 845                     {
 846                         for( int i4x4 = 0; i4x4 < 4; i4x4++ )
 847                         {
 848                             int idx = i8x8*4+i4x4;
 849                             if( x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) )
 850                             {
 851                                 h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
 852                                 h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
 853                                 if( i_decimate_8x8 < 6 )
 854                                     i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
 855                                 h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
 856                                 nnz8x8 = 1;
 857                             }
 858                         }
 859                     }
 860                     else
 861                     {
 862                         nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
 863                         if( nz )
 864                         {
 865                             FOREACH_BIT( idx, i8x8*4, nz )
 866                             {
 867                                 h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
 868                                 h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
 869                                 if( i_decimate_8x8 < 6 )
 870                                     i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
 871                                 h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
 872                             }
 873                         }
 874                     }
 875                     if( nnz8x8 )
 876                     {
 877                         i_decimate_mb += i_decimate_8x8;
 878                         if( i_decimate_8x8 < 4 )
 879                             STORE_8x8_NNZ( p, i8x8, 0 );
 880                         else
 881                             plane_cbp |= 1<<i8x8;
 882                     }
 883                 }
 884
 885                 if( i_decimate_mb < 6 )
 886                 {
 887                     plane_cbp = 0;
 888                     CLEAR_16x16_NNZ( p );
 889                 }
 890                 else
 891                 {
 892                     h->mb.i_cbp_luma |= plane_cbp;
 893                     FOREACH_BIT( i8x8, 0, plane_cbp )
 894                     {
 895                         h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
 896                     }
 897                 }
 898             }
 899         }
 900     }
 901
 902     /* encode chroma */
 903     if( chroma )
 904     {
 905         if( IS_INTRA( h->mb.i_type ) )
 906         {
 907             int i_mode = h->mb.i_chroma_pred_mode;
 908             if( h->mb.b_lossless )
 909                 x264_predict_lossless_chroma( h, i_mode );
 910             else
 911             {
 912                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
 913                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
 914             }
 915         }
 916
 917         /* encode the 8x8 blocks */
 918         x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
 919     }
 920     else
 921         h->mb.i_cbp_chroma = 0;
 922
 923     /* store cbp */
 924     int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
 925     if( h->param.b_cabac )
 926         cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
 927             |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
 928             |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
 929     h->mb.cbp[h->mb.i_mb_xy] = cbp;
 930
 931     /* Check for P_SKIP
 932      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
 933      *      (if multiple mv give same result)*/
 934     if( !b_force_no_skip )
 935     {
 936         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
 937             !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
 938             M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
 939             && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
 940         {
 941             h->mb.i_type = P_SKIP;
 942         }
 943
 944         /* Check for B_SKIP */
 945         if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
 946         {
 947             h->mb.i_type = B_SKIP;
 948         }
 949     }
 950 }
 951
 952 void x264_macroblock_encode( x264_t *h )
 953 {
 954     if( CHROMA444 )
 955         x264_macroblock_encode_internal( h, 3, 0 );
 956     else
 957         x264_macroblock_encode_internal( h, 1, 1 );
 958 }
 959
 960 /*****************************************************************************
 961  * x264_macroblock_probe_skip:
 962  *  Check if the current MB could be encoded as a [PB]_SKIP
 963  *****************************************************************************/
 964 static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
 965 {
 966     ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
 967     ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
 968     ALIGNED_4( int16_t mvp[2] );
 969     int i_qp = h->mb.i_qp;
 970
 971     for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
 972     {
 973         int quant_cat = p ? CQM_4PC : CQM_4PY;
 974         if( !b_bidir )
 975         {
 976             /* Get the MV */
 977             mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
 978             mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );
 979
 980             /* Motion compensation */
 981             h->mc.mc_luma( h->mb.pic.p_fdec[p],    FDEC_STRIDE,
 982                            &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
 983                            mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
 984         }
 985
 986         for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
 987         {
 988             int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
 989             int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
 990
 991             h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
 992                                         h->mb.pic.p_fdec[p] + fdec_offset );
 993
 994             if( h->mb.b_noise_reduction )
 995                 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
 996                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
 997
 998             int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
 999             FOREACH_BIT( idx, 0, nz )
1000             {
1001                 h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
1002                 i_decimate_mb += h->quantf.decimate_score16( dctscan );
1003                 if( i_decimate_mb >= 6 )
1004                     return 0;
1005             }
1006         }
1007     }
1008
1009     if( chroma == CHROMA_420 || chroma == CHROMA_422 )
1010     {
1011         i_qp = h->mb.i_chroma_qp;
1012         int chroma422 = chroma == CHROMA_422;
1013         int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
1014         int ssd;
1015         ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
1016
1017         if( !b_bidir )
1018         {
1019             /* Special case for mv0, which is (of course) very common in P-skip mode. */
1020             if( M32( mvp ) )
1021                 h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
1022                                  h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
1023                                  mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
1024             else
1025                 h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
1026                                                      h->mb.pic.i_stride[1], chroma422?16:8 );
1027         }
1028
1029         for( int ch = 0; ch < 2; ch++ )
1030         {
1031             pixel *p_src = h->mb.pic.p_fenc[1+ch];
1032             pixel *p_dst = h->mb.pic.p_fdec[1+ch];
1033
1034             if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
1035                 h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
1036                                                       h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
1037                                                       &h->sh.weight[0][1+ch], chroma422?16:8 );
1038
1039             /* there is almost never a termination during chroma, but we can't avoid the check entirely */
1040             /* so instead we check SSD and skip the actual check if the score is low enough. */
1041             ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
1042             if( ssd < thresh )
1043                 continue;
1044
1045             /* The vast majority of chroma checks will terminate during the DC check or the higher
1046              * threshold check, so we can save time by doing a DC-only DCT. */
1047             if( h->mb.b_noise_reduction )
1048             {
1049                 for( int i = 0; i <= chroma422; i++ )
1050                     h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
1051
1052                 for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
1053                 {
1054                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
1055                     dct_dc[i4x4] = dct4x4[i4x4][0];
1056                     dct4x4[i4x4][0] = 0;
1057                 }
1058             }
1059             else
1060             {
1061                 if( chroma422 )
1062                     h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
1063                 else
1064                     h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
1065             }
1066
1067             for( int i = 0; i <= chroma422; i++ )
1068                 if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
1069                                             h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
1070                     return 0;
1071
1072             /* If there wasn't a termination in DC, we can check against a much higher threshold. */
1073             if( ssd < thresh*4 )
1074                 continue;
1075
1076             if( !h->mb.b_noise_reduction )
1077                 for( int i = 0; i <= chroma422; i++ )
1078                 {
1079                     h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
1080                     dct4x4[i*4+0][0] = 0;
1081                     dct4x4[i*4+1][0] = 0;
1082                     dct4x4[i*4+2][0] = 0;
1083                     dct4x4[i*4+3][0] = 0;
1084                 }
1085
1086             /* calculate dct coeffs */
1087             for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ )
1088             {
1089                 int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
1090                 FOREACH_BIT( idx, i8x8*4, nz )
1091                 {
1092                     h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
1093                     i_decimate_mb += h->quantf.decimate_score15( dctscan );
1094                     if( i_decimate_mb >= 7 )
1095                         return 0;
1096                 }
1097             }
1098         }
1099     }
1100
1101     h->mb.b_skip_mc = 1;
1102     return 1;
1103 }
1104
1105 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
1106 {
1107     if( CHROMA_FORMAT == CHROMA_444 )
1108         return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
1109     else if( CHROMA_FORMAT == CHROMA_422 )
1110         return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
1111     else
1112         return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
1113 }
1114
1115 /****************************************************************************
1116  * DCT-domain noise reduction / adaptive deadzone
1117  * from libavcodec
1118  ****************************************************************************/
1119
1120 void x264_noise_reduction_update( x264_t *h )
1121 {
1122     h->nr_offset = h->nr_offset_denoise;
1123     h->nr_residual_sum = h->nr_residual_sum_buf[0];
1124     h->nr_count = h->nr_count_buf[0];
1125     for( int cat = 0; cat < 3 + CHROMA444; cat++ )
1126     {
1127         int dct8x8 = cat&1;
1128         int size = dct8x8 ? 64 : 16;
1129         const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
1130
1131         if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
1132         {
1133             for( int i = 0; i < size; i++ )
1134                 h->nr_residual_sum[cat][i] >>= 1;
1135             h->nr_count[cat] >>= 1;
1136         }
1137
1138         for( int i = 0; i < size; i++ )
1139             h->nr_offset[cat][i] =
1140                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
1141                  + h->nr_residual_sum[cat][i]/2)
1142               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
1143
1144         /* Don't denoise DC coefficients */
1145         h->nr_offset[cat][0] = 0;
1146     }
1147 }
1148
1149 /*****************************************************************************
1150  * RD only; 4 calls to this do not make up for one macroblock_encode.
1151  * doesn't transform chroma dc.
1152  *****************************************************************************/
1153 static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
1154 {
1155     int b_decimate = h->mb.b_dct_decimate;
1156     int i_qp = h->mb.i_qp;
1157     int x = i8&1;
1158     int y = i8>>1;
1159     int nz;
1160     int chroma422 = chroma == CHROMA_422;
1161
1162     h->mb.i_cbp_chroma = 0;
1163     h->mb.i_cbp_luma &= ~(1 << i8);
1164
1165     if( !h->mb.b_skip_mc )
1166         x264_mb_mc_8x8( h, i8 );
1167
1168     if( h->mb.b_lossless )
1169     {
1170         for( int p = 0; p < plane_count; p++ )
1171         {
1172             pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
1173             pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
1174             int nnz8x8 = 0;
1175             if( h->mb.b_transform_8x8 )
1176             {
1177                 nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
1178                 STORE_8x8_NNZ( p, i8, nnz8x8 );
1179             }
1180             else
1181             {
1182                 for( int i4 = i8*4; i4 < i8*4+4; i4++ )
1183                 {
1184                     nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
1185                                              h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
1186                                              h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
1187                     h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
1188                     nnz8x8 |= nz;
1189                 }
1190             }
1191             h->mb.i_cbp_luma |= nnz8x8 << i8;
1192         }
1193         if( chroma == CHROMA_420 || chroma == CHROMA_422 )
1194         {
1195             for( int ch = 0; ch < 2; ch++ )
1196             {
1197                 dctcoef dc;
1198                 pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
1199                 pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
1200
1201                 for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
1202                 {
1203                     int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
1204                     nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
1205                     h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
1206                 }
1207             }
1208             h->mb.i_cbp_chroma = 0x02;
1209         }
1210     }
1211     else
1212     {
1213         if( h->mb.b_transform_8x8 )
1214         {
1215             for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
1216             {
1217                 int quant_cat = p ? CQM_8PC : CQM_8PY;
1218                 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
1219                 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
1220                 ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
1221
1222                 h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
1223                 int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
1224                 if( nnz8x8 )
1225                 {
1226                     h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );
1227
1228                     if( b_decimate && !h->mb.b_trellis )
1229                         nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );
1230
1231                     if( nnz8x8 )
1232                     {
1233                         h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
1234                         h->dctf.add8x8_idct8( p_fdec, dct8x8 );
1235                         STORE_8x8_NNZ( p, i8, 1 );
1236                         h->mb.i_cbp_luma |= 1 << i8;
1237                     }
1238                     else
1239                         STORE_8x8_NNZ( p, i8, 0 );
1240                 }
1241                 else
1242                     STORE_8x8_NNZ( p, i8, 0 );
1243             }
1244         }
1245         else
1246         {
1247             for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
1248             {
1249                 int quant_cat = p ? CQM_4PC : CQM_4PY;
1250                 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
1251                 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
1252                 int i_decimate_8x8 = b_decimate ? 0 : 4;
1253                 ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
1254                 int nnz8x8 = 0;
1255
1256                 h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
1257                 STORE_8x8_NNZ( p, i8, 0 );
1258
1259                 if( h->mb.b_noise_reduction )
1260                     for( int idx = 0; idx < 4; idx++ )
1261                         h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
1262
1263                 if( h->mb.b_trellis )
1264                 {
1265                     for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1266                     {
1267                         if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) )
1268                         {
1269                             h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
1270                             h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
1271                             if( i_decimate_8x8 < 4 )
1272                                 i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
1273                             h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
1274                             nnz8x8 = 1;
1275                         }
1276                     }
1277                 }
1278                 else
1279                 {
1280                     nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
1281                     if( nz )
1282                     {
1283                         FOREACH_BIT( i4x4, 0, nz )
1284                         {
1285                             h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
1286                             h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
1287                             if( i_decimate_8x8 < 4 )
1288                                 i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
1289                             h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
1290                         }
1291                     }
1292                 }
1293                 if( nnz8x8 )
1294                 {
1295                     /* decimate this 8x8 block */
1296                     if( i_decimate_8x8 < 4 )
1297                         STORE_8x8_NNZ( p, i8, 0 );
1298                     else
1299                     {
1300                         h->dctf.add8x8_idct( p_fdec, dct4x4 );
1301                         h->mb.i_cbp_luma |= 1 << i8;
1302                     }
1303                 }
1304             }
1305         }
1306
1307         if( chroma == CHROMA_420 || chroma == CHROMA_422 )
1308         {
1309             i_qp = h->mb.i_chroma_qp;
1310             for( int ch = 0; ch < 2; ch++ )
1311             {
1312                 ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
1313                 pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
1314                 pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
1315
1316                 for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
1317                 {
1318                     h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );
1319
1320                     if( h->mb.b_noise_reduction )
1321                         h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
1322                     dct4x4[i4x4][0] = 0;
1323
1324                     if( h->mb.b_trellis )
1325                         nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
1326                     else
1327                         nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
1328
1329                     int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
1330                     h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
1331                     if( nz )
1332                     {
1333                         h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
1334                         h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
1335                         h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
1336                     }
1337                 }
1338             }
1339             h->mb.i_cbp_chroma = 0x02;
1340         }
1341     }
1342 }
1343
1344 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1345 {
1346     if( CHROMA444 )
1347         x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
1348     else if( CHROMA_FORMAT == CHROMA_422 )
1349         x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
1350     else
1351         x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
1352 }
1353
1354 /*****************************************************************************
1355  * RD only, luma only (for 4:2:0)
1356  *****************************************************************************/
1357 static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
1358 {
1359     int i_qp = h->mb.i_qp;
1360
1361     for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
1362     {
1363         int quant_cat = p ? CQM_4PC : CQM_4PY;
1364         pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
1365         pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
1366         int nz;
1367
1368         /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
1369
1370         if( h->mb.b_lossless )
1371         {
1372             nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
1373             h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
1374         }
1375         else
1376         {
1377             ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
1378             h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
1379             nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
1380             h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
1381             if( nz )
1382             {
1383                 h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
1384                 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
1385                 h->dctf.add4x4_idct( p_fdec, dct4x4 );
1386             }
1387         }
1388     }
1389 }
1390
1391 void x264_macroblock_encode_p4x4( x264_t *h, int i8 )
1392 {
1393     if( CHROMA444 )
1394         x264_macroblock_encode_p4x4_internal( h, i8, 3 );
1395     else
1396         x264_macroblock_encode_p4x4_internal( h, i8, 1 );
1397 }