]> git.sesse.net Git - x264/blob - encoder/macroblock.c
4:2:2 encoding support
[x264] / encoder / macroblock.c
1 /*****************************************************************************
2  * macroblock.c: macroblock encoding
3  *****************************************************************************
4  * Copyright (C) 2003-2011 x264 project
5  *
6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7  *          Loren Merritt <lorenm@u.washington.edu>
8  *          Fiona Glaser <fiona@x264.com>
9  *          Henrik Gramner <hengar-6@student.ltu.se>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24  *
25  * This program is also available under a commercial proprietary license.
26  * For more information, contact us at licensing@x264.com.
27  *****************************************************************************/
28
29 #include "common/common.h"
30 #include "macroblock.h"
31
32 /* These chroma DC functions don't have assembly versions and are only used here. */
33
34 #define ZIG(i,y,x) level[i] = dct[x*2+y];
35 static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
36 {
37     ZIG(0,0,0)
38     ZIG(1,0,1)
39     ZIG(2,1,0)
40     ZIG(3,1,1)
41 }
42 #undef ZIG
43
44 static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
45 {
46     level[0] = dct[0];
47     level[1] = dct[2];
48     level[2] = dct[1];
49     level[3] = dct[4];
50     level[4] = dct[6];
51     level[5] = dct[3];
52     level[6] = dct[5];
53     level[7] = dct[7];
54 }
55
56 #define IDCT_DEQUANT_2X2_START \
57     int d0 = dct[0] + dct[1]; \
58     int d1 = dct[2] + dct[3]; \
59     int d2 = dct[0] - dct[1]; \
60     int d3 = dct[2] - dct[3]; \
61     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
62
63 static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
64 {
65     IDCT_DEQUANT_2X2_START
66     dct4x4[0][0] = (d0 + d1) * dmf >> 5;
67     dct4x4[1][0] = (d0 - d1) * dmf >> 5;
68     dct4x4[2][0] = (d2 + d3) * dmf >> 5;
69     dct4x4[3][0] = (d2 - d3) * dmf >> 5;
70 }
71
72 static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
73 {
74     IDCT_DEQUANT_2X2_START
75     dct[0] = (d0 + d1) * dmf >> 5;
76     dct[1] = (d0 - d1) * dmf >> 5;
77     dct[2] = (d2 + d3) * dmf >> 5;
78     dct[3] = (d2 - d3) * dmf >> 5;
79 }
80 #undef IDCT_2X2_DEQUANT_START
81
82 static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
83 {
84     int d0 = dct4x4[0][0] + dct4x4[1][0];
85     int d1 = dct4x4[2][0] + dct4x4[3][0];
86     int d2 = dct4x4[0][0] - dct4x4[1][0];
87     int d3 = dct4x4[2][0] - dct4x4[3][0];
88     d[0] = d0 + d1;
89     d[2] = d2 + d3;
90     d[1] = d0 - d1;
91     d[3] = d2 - d3;
92     dct4x4[0][0] = 0;
93     dct4x4[1][0] = 0;
94     dct4x4[2][0] = 0;
95     dct4x4[3][0] = 0;
96 }
97
/* Return 1 if any of the first i_count coefficients of v is nonzero,
 * 0 otherwise.  Scans one machine word at a time via M64/M32, so v is
 * expected to be word-aligned and i_count a multiple of the number of
 * coefficients per word (callers pass 4/8/16-element DC arrays —
 * NOTE(review): presumably always satisfied; confirm at call sites). */
static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
{
    if( WORD_SIZE == 8 )
    {
        /* 64-bit words: 8/sizeof(dctcoef) coefficients per load. */
        for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
            if( M64( &v[i] ) )
                return 1;
    }
    else
    {
        /* 32-bit words: 4/sizeof(dctcoef) coefficients per load. */
        for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
            if( M32( &v[i] ) )
                return 1;
    }
    return 0;
}
114
115 static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
116 {
117     int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
118     if( h->mb.b_noise_reduction )
119         h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
120     if( h->mb.b_trellis )
121         return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 );
122     else
123         return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
124 }
125
126 static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
127 {
128     int i_quant_cat = b_intra ? (p?CQM_8IC:CQM_8IY) : (p?CQM_8PC:CQM_8PY);
129     if( h->mb.b_noise_reduction )
130         h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 );
131     if( h->mb.b_trellis )
132         return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 );
133     else
134         return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
135 }
136
137 /* All encoding functions must output the correct CBP and NNZ values.
138  * The entropy coding functions will check CBP first, then NNZ, before
139  * actually reading the DCT coefficients.  NNZ still must be correct even
140  * if CBP is zero because of the use of NNZ values for context selection.
141  * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
142  * that is only needed in CAVLC, and will be calculated by CAVLC's residual
143  * coding and stored as necessary. */
144
145 /* This means that decimation can be done merely by adjusting the CBP and NNZ
146  * rather than memsetting the coefficients. */
147
/* Encode one intra 4x4 block: predict, transform, quantize, zigzag,
 * and reconstruct into fdec.  Updates the NNZ cache entry for the
 * block and the luma CBP.
 *   p      - plane index (0 = luma; nonzero only with per-plane coding)
 *   idx    - 4x4 block index within the macroblock (0-15)
 *   i_qp   - quantizer for this block
 *   i_mode - intra 4x4 prediction mode */
void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode )
{
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
    pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );

    if( h->mb.b_lossless )
        x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode );
    else
        h->predict_4x4[i_mode]( p_dst );

    if( h->mb.b_lossless )
    {
        /* Lossless: zigzag the raw residual directly; no transform/quant. */
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst );
        h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
        h->mb.i_cbp_luma |= nz<<(idx>>2);
        return;
    }

    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );

    nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx );
    h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
    if( nz )
    {
        /* idx>>2 selects which 8x8 group's CBP bit this 4x4 block sets. */
        h->mb.i_cbp_luma |= 1<<(idx>>2);
        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 );
        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp );
        h->dctf.add4x4_idct( p_dst, dct4x4 );
    }
}
180
/* Set the NNZ cache entries for all four 4x4 sub-blocks of 8x8 block
 * 'idx' in plane 'p' to 'nz' (two 16-bit stores cover the 2x2 group). */
#define STORE_8x8_NNZ( p, idx, nz )\
do\
{\
    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\
    M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\
} while(0)

/* Zero the NNZ cache entries for all sixteen 4x4 blocks of plane 'p'. */
#define CLEAR_16x16_NNZ( p ) \
do\
{\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 0]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 2]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+ 8]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[16*p+10]] ) = 0;\
} while(0)
196
/* Encode one intra 8x8 block: filter edges if the caller didn't,
 * predict, 8x8 transform, quantize, zigzag, and reconstruct.  Updates
 * the NNZ cache for all four covered 4x4 sub-blocks and the luma CBP.
 *   p      - plane index
 *   idx    - 8x8 block index within the macroblock (0-3)
 *   i_qp   - quantizer for this block
 *   i_mode - intra 8x8 prediction mode
 *   edge   - prefiltered edge samples, or NULL to filter here */
void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge )
{
    int x = idx&1;
    int y = idx>>1;
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
    pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
    ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
    ALIGNED_ARRAY_32( pixel, edge_buf,[36] );

    if( !edge )
    {
        h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] );
        edge = edge_buf;
    }

    if( h->mb.b_lossless )
        x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge );
    else
        h->predict_8x8[i_mode]( p_dst, edge );

    if( h->mb.b_lossless )
    {
        /* Lossless: zigzag the raw residual directly; no transform/quant. */
        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst );
        STORE_8x8_NNZ( p, idx, nz );
        h->mb.i_cbp_luma |= nz<<idx;
        return;
    }

    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );

    nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx );
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<idx;
        h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 );
        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp );
        h->dctf.add8x8_idct8( p_dst, dct8x8 );
        STORE_8x8_NNZ( p, idx, 1 );
    }
    else
        STORE_8x8_NNZ( p, idx, 0 );
}
240
/* Encode an intra 16x16 plane: predict, 4x4-transform each sub-block,
 * collect the sixteen DC terms into a separately Hadamard-transformed
 * DC block, quantize AC and DC, optionally decimate the AC
 * coefficients, and reconstruct into fdec.
 *   p    - plane index
 *   i_qp - quantizer for this plane */
static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
    pixel *p_src = h->mb.pic.p_fenc[p];
    pixel *p_dst = h->mb.pic.p_fdec[p];

    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
    ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] );

    int nz, block_cbp = 0;
    /* When decimation is disabled, seed the score past the threshold so
     * the "decimate_score < 6" checks below can never trigger. */
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
    int i_quant_cat = p ? CQM_4IC : CQM_4IY;
    int i_mode = h->mb.i_intra16x16_pred_mode;

    if( h->mb.b_lossless )
        x264_predict_lossless_16x16( h, p, i_mode );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );

    if( h->mb.b_lossless )
    {
        /* Lossless: code raw residuals; DC terms go into their own block. */
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
            block_cbp |= nz;
        }
        h->mb.i_cbp_luma |= block_cbp * 0xf;
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
        return;
    }

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    for( int i = 0; i < 16; i++ )
    {
        /* copy dc coeff */
        if( h->mb.b_noise_reduction )
            h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 );
        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
        dct4x4[i][0] = 0;

        /* quant/scan/dequant */
        if( h->mb.b_trellis )
            nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i );
        else
            nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
        h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] );
            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp );
            /* Stop accumulating once the block is known non-decimatable. */
            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] );
            block_cbp = 0xf;
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
    if( decimate_score < 6 )
    {
        CLEAR_16x16_NNZ( p );
        block_cbp = 0;
    }
    else
        h->mb.i_cbp_luma |= block_cbp;

    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp );  /* XXX not inversed */
        /* Re-inject the reconstructed DC terms into the AC blocks so a
         * single full idct handles both. */
        if( block_cbp )
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( block_cbp )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
335
336 /* Round down coefficients losslessly in DC-only chroma blocks.
337  * Unlike luma blocks, this can't be done with a lookup table or
338  * other shortcut technique because of the interdependencies
339  * between the coefficients due to the chroma DC transform. */
340 static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
341 {
342     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
343
344     /* If the QP is too high, there's no benefit to rounding optimization. */
345     if( dmf > 32*64 )
346         return 1;
347
348     if( chroma422 )
349         return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
350     else
351         return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
352 }
353
/* Encode both chroma planes (U then V) of a macroblock: DC/AC
 * transform, quantization, optional decimation, and reconstruction.
 * Sets the chroma NNZ cache entries and i_cbp_chroma
 * (0 = none, 1 = DC only, 2 = DC+AC).  chroma422 is a compile-time
 * constant at each call site so the whole body specializes per format. */
static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
    ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
    h->mb.i_cbp_chroma = 0;
    h->nr_count[2] += h->mb.b_noise_reduction * 4;

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
    {
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd[2];
        int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;

        /* Only compute the second plane's variance if the first didn't
         * already blow the threshold. */
        int score  = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
        if( score < thresh*4 )
            score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
        if( score < thresh*4 )
        {
            /* Residual is small: clear all chroma AC NNZ up front, then
             * consider DC-only coding per plane below. */
            M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
            if( chroma422 )
            {
                M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
            }
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;

            for( int ch = 0; ch < 2; ch++ )
            {
                /* Per-plane check: only code DC if this plane's SSD alone
                 * exceeds the threshold. */
                if( ssd[ch] > thresh )
                {
                    pixel *p_src = h->mb.pic.p_fenc[1+ch];
                    pixel *p_dst = h->mb.pic.p_fdec[1+ch];

                    if( chroma422 )
                        /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
                        h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                    else
                        h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );

                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
                    else
                    {
                        nz_dc = 0;
                        for( int i = 0; i <= chroma422; i++ )
                            nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                             h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
                    }

                    if( nz_dc )
                    {
                        /* If rounding optimization kills all coefficients,
                         * leave the plane uncoded. */
                        if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
                        if( chroma422 )
                        {
                            zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                            h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
                        }
                        else
                        {
                            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                            idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
                        }

                        for( int i = 0; i <= chroma422; i++ )
                            h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }

    /* Full encode path: both planes, AC + DC. */
    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = 0;
        int nz_ac = 0;

        ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );

        if( h->mb.b_lossless )
        {
            /* Scan order of the DC terms within the 4:2:2 DC block. */
            static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };

            for( int i = 0; i < (chroma422?8:4); i++ )
            {
                int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
                int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
                                           &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
            continue;
        }

        for( int i = 0; i <= chroma422; i++ )
            h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

        if( h->mb.b_noise_reduction )
            for( int i = 0; i < (chroma422?8:4); i++ )
                h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );

        /* Pull the DC terms out of the AC blocks (also zeroes them there). */
        if( chroma422 )
            h->dctf.dct2x4dc( dct_dc, dct4x4 );
        else
            dct2x2dc( dct_dc, dct4x4 );

        /* calculate dct coeffs */
        for( int i = 0; i < (chroma422?8:4); i++ )
        {
            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
            h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
            if( nz )
            {
                nz_ac = 1;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] );
                h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp );
                if( b_decimate )
                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] );
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
        else
        {
            nz_dc = 0;
            for( int i = 0; i <= chroma422; i++ )
                nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                 h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
        }

        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;

        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
        {
            /* Decimate the block */
            M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
            if( chroma422 )
            {
                M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
            }

            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
                continue;
            }
            /* DC-only */
            if( chroma422 )
            {
                zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
            }
            else
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;

            if( nz_dc )
            {
                /* Merge the reconstructed DC back into the AC blocks so one
                 * full idct reconstructs the plane. */
                if( chroma422 )
                {
                    zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                    h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
                }
                else
                {
                    zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                    idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
                }
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
    h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
                           h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
}
568
/* Encode both chroma planes.  The if/else passes a literal chroma422
 * flag so each branch gets its own fully specialized copy of the
 * ALWAYS_INLINE implementation with the flag constant-folded. */
void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
{
    if( CHROMA_FORMAT == CHROMA_420 )
        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
    else
        x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
}
576
/* Mark the macroblock as skipped: zero every NNZ cache entry (luma and
 * both chroma planes) and clear all CBP state. */
static void x264_macroblock_encode_skip( x264_t *h )
{
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
    /* 4:2:2 and 4:4:4 have extra chroma rows in the NNZ cache. */
    if( CHROMA_FORMAT >= CHROMA_422 )
    {
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
    }
    h->mb.i_cbp_luma = 0;
    h->mb.i_cbp_chroma = 0;
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}
598
599 /*****************************************************************************
600  * Intra prediction for predictive lossless mode.
601  *****************************************************************************/
602
/* Chroma intra prediction for predictive lossless mode: V and H modes
 * predict from the *encoded* neighboring pixels (p_fenc) rather than
 * the reconstructed ones, since reconstruction is exact in lossless
 * mode; other modes fall through to the normal predictors. */
void x264_predict_lossless_chroma( x264_t *h, int i_mode )
{
    int height = 16 >> h->mb.chroma_v_shift;
    if( i_mode == I_PRED_CHROMA_V )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
        /* Keep the fdec top row consistent with the row above it. */
        memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
        memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
    }
    else if( i_mode == I_PRED_CHROMA_H )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
        x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
        x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
        /* 4:2:2 blocks are 16 rows tall: copy the lower column half too. */
        if( CHROMA_FORMAT == CHROMA_422 )
        {
            x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
            x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
        }
    }
    else
    {
        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
    }
}
631
632 void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
633 {
634     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
635     pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
636
637     if( i_mode == I_PRED_4x4_V )
638         h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
639     else if( i_mode == I_PRED_4x4_H )
640         h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
641     else
642         h->predict_4x4[i_mode]( p_dst );
643 }
644
645 void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
646 {
647     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
648     pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
649
650     if( i_mode == I_PRED_8x8_V )
651         h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
652     else if( i_mode == I_PRED_8x8_H )
653         h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
654     else
655         h->predict_8x8[i_mode]( p_dst, edge );
656 }
657
658 void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
659 {
660     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
661     if( i_mode == I_PRED_16x16_V )
662         h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
663     else if( i_mode == I_PRED_16x16_H )
664         h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
665     else
666         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
667 }
668
669 /*****************************************************************************
670  * x264_macroblock_encode:
671  *****************************************************************************/
672 static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
673 {
674     int i_qp = h->mb.i_qp;
675     int b_decimate = h->mb.b_dct_decimate;
676     int b_force_no_skip = 0;
677     int nz;
678     h->mb.i_cbp_luma = 0;
679     for( int p = 0; p < plane_count; p++ )
680         h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
681
682     if( h->mb.i_type == I_PCM )
683     {
684         /* if PCM is chosen, we need to store reconstructed frame data */
685         for( int p = 0; p < plane_count; p++ )
686             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
687         if( chroma )
688         {
689             int height = 16 >> h->mb.chroma_v_shift;
690             h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
691             h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
692         }
693         return;
694     }
695
696     if( !h->mb.b_allow_skip )
697     {
698         b_force_no_skip = 1;
699         if( IS_SKIP(h->mb.i_type) )
700         {
701             if( h->mb.i_type == P_SKIP )
702                 h->mb.i_type = P_L0;
703             else if( h->mb.i_type == B_SKIP )
704                 h->mb.i_type = B_DIRECT;
705         }
706     }
707
708     if( h->mb.i_type == P_SKIP )
709     {
710         /* don't do pskip motion compensation if it was already done in macroblock_analyse */
711         if( !h->mb.b_skip_mc )
712         {
713             int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
714                                   h->mb.mv_min[0], h->mb.mv_max[0] );
715             int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
716                                   h->mb.mv_min[1], h->mb.mv_max[1] );
717
718             for( int p = 0; p < plane_count; p++ )
719                 h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
720                                &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
721                                mvx, mvy, 16, 16, &h->sh.weight[0][p] );
722
723             if( chroma )
724             {
725                 int v_shift = h->mb.chroma_v_shift;
726                 int height = 16 >> v_shift;
727
728                 /* Special case for mv0, which is (of course) very common in P-skip mode. */
729                 if( mvx | mvy )
730                     h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
731                                      h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
732                                      mvx, 2*mvy>>v_shift, 8, height );
733                 else
734                     h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
735                                                          h->mb.pic.i_stride[1], height );
736
737                 if( h->sh.weight[0][1].weightfn )
738                     h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
739                                                        h->mb.pic.p_fdec[1], FDEC_STRIDE,
740                                                        &h->sh.weight[0][1], height );
741                 if( h->sh.weight[0][2].weightfn )
742                     h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
743                                                        h->mb.pic.p_fdec[2], FDEC_STRIDE,
744                                                        &h->sh.weight[0][2], height );
745             }
746         }
747
748         x264_macroblock_encode_skip( h );
749         return;
750     }
751     if( h->mb.i_type == B_SKIP )
752     {
753         /* don't do bskip motion compensation if it was already done in macroblock_analyse */
754         if( !h->mb.b_skip_mc )
755             x264_mb_mc( h );
756         x264_macroblock_encode_skip( h );
757         return;
758     }
759
760     if( h->mb.i_type == I_16x16 )
761     {
762         h->mb.b_transform_8x8 = 0;
763
764         for( int p = 0; p < plane_count; p++ )
765         {
766             x264_mb_encode_i16x16( h, p, i_qp );
767             i_qp = h->mb.i_chroma_qp;
768         }
769     }
770     else if( h->mb.i_type == I_8x8 )
771     {
772         h->mb.b_transform_8x8 = 1;
773         /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
774         if( h->mb.i_skip_intra )
775         {
776             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
777             M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
778             M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
779             M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
780             M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
781             h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
782             /* In RD mode, restore the now-overwritten DCT data. */
783             if( h->mb.i_skip_intra == 2 )
784                 h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
785         }
786         for( int p = 0; p < plane_count; p++ )
787         {
788             for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
789             {
790                 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
791                 x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL );
792             }
793             i_qp = h->mb.i_chroma_qp;
794         }
795     }
796     else if( h->mb.i_type == I_4x4 )
797     {
798         h->mb.b_transform_8x8 = 0;
799         /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
800         if( h->mb.i_skip_intra )
801         {
802             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
803             M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
804             M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
805             M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
806             M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
807             h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
808             /* In RD mode, restore the now-overwritten DCT data. */
809             if( h->mb.i_skip_intra == 2 )
810                 h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
811         }
812         for( int p = 0; p < plane_count; p++ )
813         {
814             for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
815             {
816                 pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
817                 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
818
819                 if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
820                     /* emulate missing topright samples */
821                     MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
822
823                 x264_mb_encode_i4x4( h, p, i, i_qp, i_mode );
824             }
825             i_qp = h->mb.i_chroma_qp;
826         }
827     }
828     else    /* Inter MB */
829     {
830         int i_decimate_mb = 0;
831
832         /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
833         if( !h->mb.b_skip_mc )
834             x264_mb_mc( h );
835
836         if( h->mb.b_lossless )
837         {
838             if( h->mb.b_transform_8x8 )
839                 for( int p = 0; p < plane_count; p++ )
840                     for( int i8x8 = 0; i8x8 < 4; i8x8++ )
841                     {
842                         int x = i8x8&1;
843                         int y = i8x8>>1;
844                         nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
845                                                                            h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
846                         STORE_8x8_NNZ( p, i8x8, nz );
847                         h->mb.i_cbp_luma |= nz << i8x8;
848                     }
849             else
850                 for( int p = 0; p < plane_count; p++ )
851                     for( int i4x4 = 0; i4x4 < 16; i4x4++ )
852                     {
853                         nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
854                                                  h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
855                                                  h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
856                         h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
857                         h->mb.i_cbp_luma |= nz << (i4x4>>2);
858                     }
859         }
860         else if( h->mb.b_transform_8x8 )
861         {
862             ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
863             b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
864
865             for( int p = 0; p < plane_count; p++ )
866             {
867                 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
868                 h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
869
870                 int plane_cbp = 0;
871                 for( int idx = 0; idx < 4; idx++ )
872                 {
873                     nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );
874
875                     if( nz )
876                     {
877                         h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
878                         if( b_decimate )
879                         {
880                             int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
881                             i_decimate_mb += i_decimate_8x8;
882                             if( i_decimate_8x8 >= 4 )
883                                 plane_cbp |= 1<<idx;
884                         }
885                         else
886                             plane_cbp |= 1<<idx;
887                     }
888                 }
889
890                 if( i_decimate_mb < 6 && b_decimate )
891                 {
892                     plane_cbp = 0;
893                     CLEAR_16x16_NNZ( p );
894                 }
895                 else
896                 {
897                     for( int idx = 0; idx < 4; idx++ )
898                     {
899                         int x = idx&1;
900                         int y = idx>>1;
901
902                         if( plane_cbp&(1<<idx) )
903                         {
904                             h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
905                             h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
906                             STORE_8x8_NNZ( p, idx, 1 );
907                         }
908                         else
909                             STORE_8x8_NNZ( p, idx, 0 );
910                     }
911                 }
912                 h->mb.i_cbp_luma |= plane_cbp;
913                 i_qp = h->mb.i_chroma_qp;
914             }
915         }
916         else
917         {
918             ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
919             for( int p = 0; p < plane_count; p++ )
920             {
921                 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
922                 h->nr_count[0+!!p*2] += h->mb.b_noise_reduction * 16;
923
924                 int plane_cbp = 0;
925                 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
926                 {
927                     int i_decimate_8x8 = 0;
928                     int cbp = 0;
929
930                     /* encode one 4x4 block */
931                     for( int i4x4 = 0; i4x4 < 4; i4x4++ )
932                     {
933                         int idx = i8x8 * 4 + i4x4;
934
935                         nz = x264_quant_4x4( h, dct4x4[idx], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, idx );
936                         h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
937
938                         if( nz )
939                         {
940                             h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
941                             h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
942                             if( b_decimate && i_decimate_8x8 < 6 )
943                                 i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
944                             cbp = 1;
945                         }
946                     }
947
948                     int x = i8x8&1;
949                     int y = i8x8>>1;
950
951                     /* decimate this 8x8 block */
952                     i_decimate_mb += i_decimate_8x8;
953                     if( b_decimate )
954                     {
955                         if( i_decimate_8x8 < 4 )
956                             STORE_8x8_NNZ( p, i8x8, 0 );
957                         else
958                             plane_cbp |= 1<<i8x8;
959                     }
960                     else if( cbp )
961                     {
962                         h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
963                         plane_cbp |= 1<<i8x8;
964                     }
965                 }
966
967                 if( b_decimate )
968                 {
969                     if( i_decimate_mb < 6 )
970                     {
971                         plane_cbp = 0;
972                         CLEAR_16x16_NNZ( p );
973                     }
974                     else
975                     {
976                         for( int i8x8 = 0; i8x8 < 4; i8x8++ )
977                             if( plane_cbp&(1<<i8x8) )
978                                 h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
979                     }
980                 }
981                 h->mb.i_cbp_luma |= plane_cbp;
982                 i_qp = h->mb.i_chroma_qp;
983             }
984         }
985     }
986
987     /* encode chroma */
988     if( chroma )
989     {
990         if( IS_INTRA( h->mb.i_type ) )
991         {
992             int i_mode = h->mb.i_chroma_pred_mode;
993             if( h->mb.b_lossless )
994                 x264_predict_lossless_chroma( h, i_mode );
995             else
996             {
997                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
998                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
999             }
1000         }
1001
1002         /* encode the 8x8 blocks */
1003         x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
1004     }
1005     else
1006         h->mb.i_cbp_chroma = 0;
1007
1008     /* store cbp */
1009     int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
1010     if( h->param.b_cabac )
1011         cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
1012             |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
1013             |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
1014     h->mb.cbp[h->mb.i_mb_xy] = cbp;
1015
1016     /* Check for P_SKIP
1017      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
1018      *      (if multiple mv give same result)*/
1019     if( !b_force_no_skip )
1020     {
1021         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
1022             !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
1023             M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
1024             && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
1025         {
1026             h->mb.i_type = P_SKIP;
1027         }
1028
1029         /* Check for B_SKIP */
1030         if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
1031         {
1032             h->mb.i_type = B_SKIP;
1033         }
1034     }
1035 }
1036
1037 void x264_macroblock_encode( x264_t *h )
1038 {
1039     if( CHROMA444 )
1040         x264_macroblock_encode_internal( h, 3, 0 );
1041     else
1042         x264_macroblock_encode_internal( h, 1, 1 );
1043 }
1044
1045 /*****************************************************************************
1046  * x264_macroblock_probe_skip:
1047  *  Check if the current MB could be encoded as a [PB]_SKIP
1048  *****************************************************************************/
/* Returns 1 if the macroblock can be coded as skip (all quantized residual
 * decimates to zero), 0 as soon as a termination threshold is hit.
 * b_bidir: probing B_SKIP (motion compensation already done by the caller);
 *          otherwise P_SKIP MC is performed here from the cached pskip MV.
 * plane_count/chroma are compile-time constants from the dispatcher below,
 * selecting the 4:2:0 / 4:2:2 / 4:4:4 specialization.
 * Side effect on success: sets h->mb.b_skip_mc so later encode stages don't
 * redo the motion compensation performed here. */
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
    ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );
    int i_qp = h->mb.i_qp;

    /* Luma (and, for 4:4:4, both chroma planes) via the 4x4 path. */
    for( int p = 0; p < plane_count; p++ )
    {
        int quant_cat = p ? CQM_4PC : CQM_4PY;
        if( !b_bidir )
        {
            /* Get the MV */
            mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
            mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

            /* Motion compensation */
            h->mc.mc_luma( h->mb.pic.p_fdec[p],    FDEC_STRIDE,
                           &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                           mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
        }

        for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
        {
            int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
            int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
            /* get luma diff */
            h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
                                        h->mb.pic.p_fdec[p] + fdec_offset );
            /* encode one 4x4 block */
            for( int i4x4 = 0; i4x4 < 4; i4x4++ )
            {
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ) )
                    continue;
                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
                i_decimate_mb += h->quantf.decimate_score16( dctscan );
                /* Same decimation threshold as the encode path: a score of 6
                 * or more means the residual wouldn't decimate -> not skip. */
                if( i_decimate_mb >= 6 )
                    return 0;
            }
        }
        i_qp = h->mb.i_chroma_qp;
    }

    /* Subsampled chroma check (4:2:0 and 4:2:2 only; 4:4:4 was handled above
     * as extra full-size planes). */
    if( chroma == CHROMA_420 || chroma == CHROMA_422 )
    {
        i_qp = h->mb.i_chroma_qp;
        int chroma422 = chroma == CHROMA_422;
        /* SSD threshold scaled by block size (8x16 vs 8x8) — lambda-based,
         * used to cheaply bypass the full transform check below. */
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd;
        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );

        if( !b_bidir )
        {
            /* Special case for mv0, which is (of course) very common in P-skip mode. */
            if( M32( mvp ) )
                h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                 h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
            else
                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
        }

        for( int ch = 0; ch < 2; ch++ )
        {
            pixel *p_src = h->mb.pic.p_fenc[1+ch];
            pixel *p_dst = h->mb.pic.p_fdec[1+ch];

            /* Apply explicit weighted prediction in-place before comparing. */
            if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );

            /* there is almost never a termination during chroma, but we can't avoid the check entirely */
            /* so instead we check SSD and skip the actual check if the score is low enough. */
            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
            if( ssd < thresh )
                continue;

            /* The vast majority of chroma checks will terminate during the DC check or the higher
             * threshold check, so we can save time by doing a DC-only DCT. */
            if( h->mb.b_noise_reduction )
            {
                /* Denoising modifies all coefficients, so the full DCT is
                 * required just to extract denoised DCs. */
                for( int i = 0; i <= chroma422; i++ )
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                {
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    dct_dc[i4x4] = dct4x4[i4x4][0];
                }
            }
            else
            {
                if( chroma422 )
                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                else
                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
            }

            /* Any nonzero quantized chroma DC forbids skip. The >>1 / <<1 on
             * the quant scale/bias accounts for the chroma DC transform scale. */
            for( int i = 0; i <= chroma422; i++ )
                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
                    return 0;

            /* If there wasn't a termination in DC, we can check against a much higher threshold. */
            if( ssd < thresh*4 )
                continue;

            /* Full AC check; the noise-reduction case already computed dct4x4. */
            if( !h->mb.b_noise_reduction )
               for( int i = 0; i <= chroma422; i++ )
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

            /* calculate dct coeffs */
            for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ )
            {
                /* DC was already checked above, so only score the AC coeffs. */
                dct4x4[i4x4][0] = 0;
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                    continue;
                h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
                i_decimate_mb += h->quantf.decimate_score15( dctscan );
                if( i_decimate_mb >= 7 )
                    return 0;
            }
        }
    }

    /* Skip is viable; remember that MC for this MB is already done. */
    h->mb.b_skip_mc = 1;
    return 1;
}
1183
1184 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
1185 {
1186     if( CHROMA_FORMAT == CHROMA_444 )
1187         return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
1188     else if( CHROMA_FORMAT == CHROMA_422 )
1189         return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
1190     else
1191         return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
1192 }
1193
1194 /****************************************************************************
1195  * DCT-domain noise reduction / adaptive deadzone
1196  * from libavcodec
1197  ****************************************************************************/
1198
1199 void x264_noise_reduction_update( x264_t *h )
1200 {
1201     h->nr_offset = h->nr_offset_denoise;
1202     h->nr_residual_sum = h->nr_residual_sum_buf[0];
1203     h->nr_count = h->nr_count_buf[0];
1204     for( int cat = 0; cat < 3 + CHROMA444; cat++ )
1205     {
1206         int dct8x8 = cat&1;
1207         int size = dct8x8 ? 64 : 16;
1208         const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
1209
1210         if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
1211         {
1212             for( int i = 0; i < size; i++ )
1213                 h->nr_residual_sum[cat][i] >>= 1;
1214             h->nr_count[cat] >>= 1;
1215         }
1216
1217         for( int i = 0; i < size; i++ )
1218             h->nr_offset[cat][i] =
1219                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
1220                  + h->nr_residual_sum[cat][i]/2)
1221               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
1222
1223         /* Don't denoise DC coefficients */
1224         h->nr_offset[cat][0] = 0;
1225     }
1226 }
1227
1228 /*****************************************************************************
1229  * RD only; 4 calls to this do not make up for one macroblock_encode.
1230  * doesn't transform chroma dc.
1231  *****************************************************************************/
/* Encode one inter 8x8 partition (luma + its chroma blocks) for RD analysis.
 * i8 in [0,3] selects the partition; plane_count/chroma are compile-time
 * constants from the dispatcher, specializing for 4:2:0 / 4:2:2 / 4:4:4.
 * Updates the NNZ cache, h->dct coefficient buffers, h->mb.i_cbp_luma for
 * this 8x8, and reconstructs into p_fdec. Chroma DC is NOT transformed here
 * (see the banner above). */
static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
{
    int b_decimate = h->mb.b_dct_decimate;
    int i_qp = h->mb.i_qp;
    int x = i8&1;   /* 8x8 block column within the MB */
    int y = i8>>1;  /* 8x8 block row within the MB */
    int nz;
    int chroma422 = chroma == CHROMA_422;

    /* Clear this partition's CBP bits; they're rebuilt below. */
    h->mb.i_cbp_chroma = 0;
    h->mb.i_cbp_luma &= ~(1 << i8);

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        /* Lossless: transform bypass — residual is zigzagged directly. */
        for( int p = 0; p < plane_count; p++ )
        {
            pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
            pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
            int nnz8x8 = 0;
            if( h->mb.b_transform_8x8 )
            {
                nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, nnz8x8 );
            }
            else
            {
                for( int i4 = i8*4; i4 < i8*4+4; i4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
                                             h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
                                             h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
                    nnz8x8 |= nz;
                }
            }
            h->mb.i_cbp_luma |= nnz8x8 << i8;
        }
        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            for( int ch = 0; ch < 2; ch++ )
            {
                dctcoef dc;
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                /* 4:2:2 has two stacked 4x4 chroma blocks per partition. */
                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
                    nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                }
            }
            /* RD convenience: assume AC-present chroma CBP. */
            h->mb.i_cbp_chroma = 0x02;
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            for( int p = 0; p < plane_count; p++ )
            {
                int quant_cat = p ? CQM_8PC : CQM_8PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
                h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
                if( nnz8x8 )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );

                    /* Trellis already decimates optimally, so only score here
                     * in the non-trellis case. */
                    if( b_decimate && !h->mb.b_trellis )
                        nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );

                    if( nnz8x8 )
                    {
                        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
                        h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                        STORE_8x8_NNZ( p, i8, 1 );
                    }
                    else
                        STORE_8x8_NNZ( p, i8, 0 );
                }
                else
                    STORE_8x8_NNZ( p, i8, 0 );
                h->mb.i_cbp_luma |= nnz8x8 << i8;
                i_qp = h->mb.i_chroma_qp;
            }
        }
        else
        {
            for( int p = 0; p < plane_count; p++ )
            {
                int quant_cat = p ? CQM_4PC : CQM_4PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                int i_decimate_8x8 = 0, nnz8x8 = 0;
                ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
                h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
                /* Quantize the four 4x4 sub-blocks of this partition. */
                for( int i4 = 0; i4 < 4; i4++ )
                {
                    nz = x264_quant_4x4( h, dct4x4[i4], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i8*4+i4 );
                    h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4]] = nz;
                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4], dct4x4[i4] );
                        h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[quant_cat], i_qp );
                        if( b_decimate )
                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4] );
                        nnz8x8 = 1;
                    }
                }

                /* Decimate the whole 8x8 if its total score is too low. */
                if( b_decimate && i_decimate_8x8 < 4 )
                    nnz8x8 = 0;

                if( nnz8x8 )
                    h->dctf.add8x8_idct( p_fdec, dct4x4 );
                else
                    STORE_8x8_NNZ( p, i8, 0 );

                h->mb.i_cbp_luma |= nnz8x8 << i8;
                i_qp = h->mb.i_chroma_qp;
            }
        }

        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            i_qp = h->mb.i_chroma_qp;
            for( int ch = 0; ch < 2; ch++ )
            {
                ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] );
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    /* AC only: chroma DC is handled elsewhere (see banner). */
                    dct4x4[i4x4][0] = 0;

                    if( h->mb.b_trellis )
                        nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
                    else
                        nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );

                    /* 4:2:2 block index mapping for this partition's column. */
                    int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
                        h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
                    }
                }
            }
            /* RD convenience: assume AC-present chroma CBP. */
            h->mb.i_cbp_chroma = 0x02;
        }
    }
}
1397
1398 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1399 {
1400     if( CHROMA444 )
1401         x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
1402     else if( CHROMA_FORMAT == CHROMA_422 )
1403         x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
1404     else
1405         x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
1406 }
1407
1408 /*****************************************************************************
1409  * RD only, luma only (for 4:2:0)
1410  *****************************************************************************/
1411 static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
1412 {
1413     int i_qp = h->mb.i_qp;
1414
1415     for( int p = 0; p < plane_count; p++ )
1416     {
1417         int quant_cat = p ? CQM_4PC : CQM_4PY;
1418         pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
1419         pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
1420         int nz;
1421
1422         /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
1423
1424         if( h->mb.b_lossless )
1425         {
1426             nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
1427             h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
1428         }
1429         else
1430         {
1431             ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
1432             h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
1433             nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
1434             h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
1435             if( nz )
1436             {
1437                 h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
1438                 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
1439                 h->dctf.add4x4_idct( p_fdec, dct4x4 );
1440             }
1441         }
1442         i_qp = h->mb.i_chroma_qp;
1443     }
1444 }
1445
1446 void x264_macroblock_encode_p4x4( x264_t *h, int i8 )
1447 {
1448     if( CHROMA444 )
1449         x264_macroblock_encode_p4x4_internal( h, i8, 3 );
1450     else
1451         x264_macroblock_encode_p4x4_internal( h, i8, 1 );
1452 }