git.sesse.net Git - x264/blob - encoder/macroblock.c

   1 /*****************************************************************************
   2  * macroblock.c: macroblock encoding
   3  *****************************************************************************
   4  * Copyright (C) 2003-2011 x264 project
   5  *
   6  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *          Fiona Glaser <fiona@x264.com>
   9  *          Henrik Gramner <hengar-6@student.ltu.se>
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  24  *
  25  * This program is also available under a commercial proprietary license.
  26  * For more information, contact us at licensing@x264.com.
  27  *****************************************************************************/
  28
  29 #include "common/common.h"
  30 #include "macroblock.h"
  31
  32 /* These chroma DC functions don't have assembly versions and are only used here. */
  33
  34 #define ZIG(i,y,x) level[i] = dct[x*2+y];
  35 static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
  36 {
  37     ZIG(0,0,0)
  38     ZIG(1,0,1)
  39     ZIG(2,1,0)
  40     ZIG(3,1,1)
  41 }
  42 #undef ZIG
  43
  44 static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
  45 {
  46     level[0] = dct[0];
  47     level[1] = dct[2];
  48     level[2] = dct[1];
  49     level[3] = dct[4];
  50     level[4] = dct[6];
  51     level[5] = dct[3];
  52     level[6] = dct[5];
  53     level[7] = dct[7];
  54 }
  55
  56 #define IDCT_DEQUANT_2X2_START \
  57     int d0 = dct[0] + dct[1]; \
  58     int d1 = dct[2] + dct[3]; \
  59     int d2 = dct[0] - dct[1]; \
  60     int d3 = dct[2] - dct[3]; \
  61     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  62
  63 static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
  64 {
  65     IDCT_DEQUANT_2X2_START
  66     dct4x4[0][0] = (d0 + d1) * dmf >> 5;
  67     dct4x4[1][0] = (d0 - d1) * dmf >> 5;
  68     dct4x4[2][0] = (d2 + d3) * dmf >> 5;
  69     dct4x4[3][0] = (d2 - d3) * dmf >> 5;
  70 }
  71
  72 static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
  73 {
  74     IDCT_DEQUANT_2X2_START
  75     dct[0] = (d0 + d1) * dmf >> 5;
  76     dct[1] = (d0 - d1) * dmf >> 5;
  77     dct[2] = (d2 + d3) * dmf >> 5;
  78     dct[3] = (d2 - d3) * dmf >> 5;
  79 }
  80 #undef IDCT_2X2_DEQUANT_START
  81
  82 static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
  83 {
  84     int d0 = dct4x4[0][0] + dct4x4[1][0];
  85     int d1 = dct4x4[2][0] + dct4x4[3][0];
  86     int d2 = dct4x4[0][0] - dct4x4[1][0];
  87     int d3 = dct4x4[2][0] - dct4x4[3][0];
  88     d[0] = d0 + d1;
  89     d[2] = d2 + d3;
  90     d[1] = d0 - d1;
  91     d[3] = d2 - d3;
  92     dct4x4[0][0] = 0;
  93     dct4x4[1][0] = 0;
  94     dct4x4[2][0] = 0;
  95     dct4x4[3][0] = 0;
  96 }
  97
  98 static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
  99 {
 100     if( WORD_SIZE == 8 )
 101     {
 102         for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
 103             if( M64( &v[i] ) )
 104                 return 1;
 105     }
 106     else
 107     {
 108         for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
 109             if( M32( &v[i] ) )
 110                 return 1;
 111     }
 112     return 0;
 113 }
 114
 115 /* All encoding functions must output the correct CBP and NNZ values.
 116  * The entropy coding functions will check CBP first, then NNZ, before
 117  * actually reading the DCT coefficients.  NNZ still must be correct even
 118  * if CBP is zero because of the use of NNZ values for context selection.
 119  * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 120  * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 121  * coding and stored as necessary. */
 122
 123 /* This means that decimation can be done merely by adjusting the CBP and NNZ
 124  * rather than memsetting the coefficients. */
 125
 126 static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
 127 {
 128     pixel *p_src = h->mb.pic.p_fenc[p];
 129     pixel *p_dst = h->mb.pic.p_fdec[p];
 130
 131     ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
 132     ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] );
 133
 134     int nz, block_cbp = 0;
 135     int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
 136     int i_quant_cat = p ? CQM_4IC : CQM_4IY;
 137     int i_mode = h->mb.i_intra16x16_pred_mode;
 138
 139     if( h->mb.b_lossless )
 140         x264_predict_lossless_16x16( h, p, i_mode );
 141     else
 142         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
 143
 144     if( h->mb.b_lossless )
 145     {
 146         for( int i = 0; i < 16; i++ )
 147         {
 148             int oe = block_idx_xy_fenc[i];
 149             int od = block_idx_xy_fdec[i];
 150             nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
 151             h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
 152             block_cbp |= nz;
 153         }
 154         h->mb.i_cbp_luma |= block_cbp * 0xf;
 155         h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
 156         h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
 157         return;
 158     }
 159
 160     h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );
 161
 162     for( int i = 0; i < 16; i++ )
 163     {
 164         /* copy dc coeff */
 165         if( h->mb.b_noise_reduction )
 166             h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 );
 167         dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
 168         dct4x4[i][0] = 0;
 169
 170         /* quant/scan/dequant */
 171         if( h->mb.b_trellis )
 172             nz = x264_quant_4x4_trellis( h, dct4x4[i], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, i );
 173         else
 174             nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
 175         h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
 176         if( nz )
 177         {
 178             h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+i], dct4x4[i] );
 179             h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[i_quant_cat], i_qp );
 180             if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+i] );
 181             block_cbp = 0xf;
 182         }
 183     }
 184
 185     /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
 186     /* More useful with CAVLC, but still useful with CABAC. */
 187     if( decimate_score < 6 )
 188     {
 189         CLEAR_16x16_NNZ( p );
 190         block_cbp = 0;
 191     }
 192     else
 193         h->mb.i_cbp_luma |= block_cbp;
 194
 195     h->dctf.dct4x4dc( dct_dc4x4 );
 196     if( h->mb.b_trellis )
 197         nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
 198     else
 199         nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
 200
 201     h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
 202     if( nz )
 203     {
 204         h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
 205
 206         /* output samples to fdec */
 207         h->dctf.idct4x4dc( dct_dc4x4 );
 208         h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp );  /* XXX not inversed */
 209         if( block_cbp )
 210             for( int i = 0; i < 16; i++ )
 211                 dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
 212     }
 213
 214     /* put pixels to fdec */
 215     if( block_cbp )
 216         h->dctf.add16x16_idct( p_dst, dct4x4 );
 217     else if( nz )
 218         h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
 219 }
 220
 221 /* Round down coefficients losslessly in DC-only chroma blocks.
 222  * Unlike luma blocks, this can't be done with a lookup table or
 223  * other shortcut technique because of the interdependencies
 224  * between the coefficients due to the chroma DC transform. */
 225 static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
 226 {
 227     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
 228
 229     /* If the QP is too high, there's no benefit to rounding optimization. */
 230     if( dmf > 32*64 )
 231         return 1;
 232
 233     if( chroma422 )
 234         return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
 235     else
 236         return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
 237 }
 238
 239 static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
 240 {
         /* Encode the residual of both chroma planes (p_fenc[1..2] against the
          * prediction already present in p_fdec[1..2]) and add the
          * reconstruction back into p_fdec.
          *
          *   b_inter   - non-zero for inter MBs; selects the CQM_4IC+b_inter
          *               quant/dequant tables and enables decimation.
          *   i_qp      - chroma quantizer; the DC path uses i_qp+3*chroma422.
          *   chroma422 - 0 or 1; when 1, eight 4x4 blocks and a 2x4 DC block
          *               are processed per plane instead of four and 2x2.
          *
          * On return h->mb.i_cbp_chroma is 0 (nothing coded), 1 (DC only) or
          * 2 (DC+AC), and the NNZ cache entries for the chroma AC blocks and
          * CHROMA_DC+0/1 have been written. */
 241     int nz, nz_dc;
 242     int b_decimate = b_inter && h->mb.b_dct_decimate;
 243     int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
 244     ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
 245     h->mb.i_cbp_chroma = 0;
 246     h->nr_count[2] += h->mb.b_noise_reduction * 4;
 247
 248     /* Early termination: check variance of chroma residual before encoding.
 249      * Don't bother trying early termination at low QPs.
 250      * Values are experimentally derived. */
 251     if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
 252     {
 253         int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
 254         int ssd[2];
 255         int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
 256
             /* The second plane is only measured if the first one alone is below
              * the limit.  ssd[1] is therefore only read further down when it was
              * passed to var2 (the inner branch requires the combined score to be
              * below the limit, which implies the second call ran).
              * NOTE(review): assumes var2 stores the SSD through its last argument. */
 257         int score  = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
 258         if( score < thresh*4 )
 259             score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
 260         if( score < thresh*4 )
 261         {
                 /* No AC will be coded: clear all chroma AC NNZ entries of both
                  * planes, and start with DC NNZ cleared as well. */
 262             M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
 263             M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
 264             M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
 265             M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
 266             if( chroma422 )
 267             {
 268                 M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
 269                 M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
 270                 M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
 271                 M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
 272             }
 273             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
 274             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
 275
                 /* A plane whose own SSD still exceeds thresh gets DC-only coding. */
 276             for( int ch = 0; ch < 2; ch++ )
 277             {
 278                 if( ssd[ch] > thresh )
 279                 {
 280                     pixel *p_src = h->mb.pic.p_fenc[1+ch];
 281                     pixel *p_dst = h->mb.pic.p_fdec[1+ch];
 282
 283                     if( chroma422 )
 284                         /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
 285                         h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
 286                     else
 287                         h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
 288
 289                     if( h->mb.b_trellis )
 290                         nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
 291                     else
 292                     {
                             /* One 2x2 quant call per group of four DC coefs
                              * (two groups when chroma422 is 1). */
 293                         nz_dc = 0;
 294                         for( int i = 0; i <= chroma422; i++ )
 295                             nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
 296                                                              h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
 297                     }
 298
 299                     if( nz_dc )
 300                     {
                             /* A zero return means the optimizer left nothing to
                              * code; the DC NNZ entry stays 0 as cleared above. */
 301                         if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
 302                             continue;
 303                         h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
 304                         if( chroma422 )
 305                         {
 306                             zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
 307                             h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
 308                         }
 309                         else
 310                         {
 311                             zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
 312                             idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
 313                         }
 314
 315                         for( int i = 0; i <= chroma422; i++ )
 316                             h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
 317                         h->mb.i_cbp_chroma = 1;
 318                     }
 319                 }
 320             }
                 /* i_cbp_chroma is already final here (0 or 1); the summation at
                  * the end of the function is skipped on this path. */
 321             return;
 322         }
 323     }
 324
         /* Regular path: full transform of each plane. */
 325     for( int ch = 0; ch < 2; ch++ )
 326     {
 327         pixel *p_src = h->mb.pic.p_fenc[1+ch];
 328         pixel *p_dst = h->mb.pic.p_fdec[1+ch];
 329         int i_decimate_score = 0;
 330         int nz_ac = 0;
 331
 332         ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
 333
 334         if( h->mb.b_lossless )
 335         {
 336             static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };
 337
                 /* Block i of plane ch is stored at index 16+i+ch*16; for 4:2:2
                  * the term (i&4) adds 4 for blocks 4..7, giving 24..27 (+ch*16). */
 338             for( int i = 0; i < (chroma422?8:4); i++ )
 339             {
 340                 int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
 341                 int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
 342                 nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
 343                                            &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
 344                 h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
 345                 h->mb.i_cbp_chroma |= nz;
 346             }
 347             h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
 348             continue;
 349         }
 350
 351         for( int i = 0; i <= chroma422; i++ )
 352             h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
 353
 354         if( h->mb.b_noise_reduction )
 355             for( int i = 0; i < (chroma422?8:4); i++ )
 356                 h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );
 357
             /* Collect the per-block DCs into dct_dc (the 2x2 helper also zeroes
              * them in dct4x4). */
 358         if( chroma422 )
 359             h->dctf.dct2x4dc( dct_dc, dct4x4 );
 360         else
 361             dct2x2dc( dct_dc, dct4x4 );
 362
 363         /* calculate dct coeffs */
 364         for( int i = 0; i < (chroma422?8:4); i++ )
 365         {
 366             if( h->mb.b_trellis )
 367                 nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
 368             else
 369                 nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
 370             h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
 371             if( nz )
 372             {
 373                 nz_ac = 1;
 374                 h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] );
 375                 h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp );
 376                 if( b_decimate )
 377                     i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] );
 378             }
 379         }
 380
 381         if( h->mb.b_trellis )
 382             nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
 383         else
 384         {
 385             nz_dc = 0;
 386             for( int i = 0; i <= chroma422; i++ )
 387                 nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
 388                                                  h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
 389         }
 390
 391         h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
 392
             /* Drop the AC of this plane when decimation is on and the score is
              * below 7, or when there is no AC to begin with. */
 393         if( (b_decimate && i_decimate_score < 7) || !nz_ac )
 394         {
 395             /* Decimate the block */
 396             M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
 397             M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
 398             if( chroma422 )
 399             {
 400                 M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
 401                 M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
 402             }
 403
 404             if( !nz_dc ) /* Whole block is empty */
 405                 continue;
 406             if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
 407             {
                     /* Optimizer reported nothing left: undo the DC NNZ flag
                      * stored above. */
 408                 h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
 409                 continue;
 410             }
 411             /* DC-only */
 412             if( chroma422 )
 413             {
 414                 zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
 415                 h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
 416             }
 417             else
 418             {
 419                 zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
 420                 idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
 421             }
 422
 423             for( int i = 0; i <= chroma422; i++ )
 424                 h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
 425         }
 426         else
 427         {
                 /* AC is kept for this plane. */
 428             h->mb.i_cbp_chroma = 1;
 429
 430             if( nz_dc )
 431             {
                     /* Scan the DCs for the bitstream and write the reconstructed
                      * DCs into dct4x4 so the full idct includes them. */
 432                 if( chroma422 )
 433                 {
 434                     zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
 435                     h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
 436                 }
 437                 else
 438                 {
 439                     zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
 440                     idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
 441                 }
 442             }
 443
 444             for( int i = 0; i <= chroma422; i++ )
 445                 h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
 446         }
 447     }
 448
         /* At this point i_cbp_chroma is the "AC present" flag.  Adding the OR
          * of that flag with both DC NNZ flags yields 2 when AC is present, 1
          * when only DC is, and 0 otherwise.
          * NOTE(review): relies on all three values being 0 or 1. */
 449     /* 0 = none, 1 = DC only, 2 = DC+AC */
 450     h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
 451                            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
 452 }
 453
 454 void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
 455 {
 456     if( CHROMA_FORMAT == CHROMA_420 )
 457         x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
 458     else
 459         x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
 460 }
 461
 462 static void x264_macroblock_encode_skip( x264_t *h )
 463 {
 464     M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
 465     M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
 466     M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
 467     M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
 468     M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
 469     M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
 470     M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
 471     M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
 472     if( CHROMA_FORMAT >= CHROMA_422 )
 473     {
 474         M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
 475         M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
 476         M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
 477         M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
 478     }
 479     h->mb.i_cbp_luma = 0;
 480     h->mb.i_cbp_chroma = 0;
 481     h->mb.cbp[h->mb.i_mb_xy] = 0;
 482 }
 483
 484 /*****************************************************************************
 485  * Intra prediction for predictive lossless mode.
 486  *****************************************************************************/
 487
 488 void x264_predict_lossless_chroma( x264_t *h, int i_mode )
 489 {
 490     int height = 16 >> CHROMA_V_SHIFT;
 491     if( i_mode == I_PRED_CHROMA_V )
 492     {
 493         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
 494         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
 495         memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
 496         memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
 497     }
 498     else if( i_mode == I_PRED_CHROMA_H )
 499     {
 500         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
 501         h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
 502         x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
 503         x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
 504         if( CHROMA_FORMAT == CHROMA_422 )
 505         {
 506             x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
 507             x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
 508         }
 509     }
 510     else
 511     {
 512         h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
 513         h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
 514     }
 515 }
 516
 517 void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
 518 {
 519     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
 520     pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
 521
 522     if( i_mode == I_PRED_4x4_V )
 523         h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
 524     else if( i_mode == I_PRED_4x4_H )
 525         h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
 526     else
 527         h->predict_4x4[i_mode]( p_dst );
 528 }
 529
 530 void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
 531 {
 532     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
 533     pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
 534
 535     if( i_mode == I_PRED_8x8_V )
 536         h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
 537     else if( i_mode == I_PRED_8x8_H )
 538         h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
 539     else
 540         h->predict_8x8[i_mode]( p_dst, edge );
 541 }
 542
 543 void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
 544 {
 545     int stride = h->fenc->i_stride[p] << MB_INTERLACED;
 546     if( i_mode == I_PRED_16x16_V )
 547         h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
 548     else if( i_mode == I_PRED_16x16_H )
 549         h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
 550     else
 551         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
 552 }
 553
 554 /*****************************************************************************
 555  * x264_macroblock_encode:
 556  *****************************************************************************/
 557 static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
 558 {
 559     int i_qp = h->mb.i_qp;
 560     int b_decimate = h->mb.b_dct_decimate;
 561     int b_force_no_skip = 0;
 562     int nz;
 563     h->mb.i_cbp_luma = 0;
 564     for( int p = 0; p < plane_count; p++ )
 565         h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
 566
 567     if( h->mb.i_type == I_PCM )
 568     {
 569         /* if PCM is chosen, we need to store reconstructed frame data */
 570         for( int p = 0; p < plane_count; p++ )
 571             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
 572         if( chroma )
 573         {
 574             int height = 16 >> CHROMA_V_SHIFT;
 575             h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
 576             h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
 577         }
 578         return;
 579     }
 580
 581     if( !h->mb.b_allow_skip )
 582     {
 583         b_force_no_skip = 1;
 584         if( IS_SKIP(h->mb.i_type) )
 585         {
 586             if( h->mb.i_type == P_SKIP )
 587                 h->mb.i_type = P_L0;
 588             else if( h->mb.i_type == B_SKIP )
 589                 h->mb.i_type = B_DIRECT;
 590         }
 591     }
 592
 593     if( h->mb.i_type == P_SKIP )
 594     {
 595         /* don't do pskip motion compensation if it was already done in macroblock_analyse */
 596         if( !h->mb.b_skip_mc )
 597         {
 598             int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
 599                                   h->mb.mv_min[0], h->mb.mv_max[0] );
 600             int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
 601                                   h->mb.mv_min[1], h->mb.mv_max[1] );
 602
 603             for( int p = 0; p < plane_count; p++ )
 604                 h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
 605                                &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
 606                                mvx, mvy, 16, 16, &h->sh.weight[0][p] );
 607
 608             if( chroma )
 609             {
 610                 int v_shift = CHROMA_V_SHIFT;
 611                 int height = 16 >> v_shift;
 612
 613                 /* Special case for mv0, which is (of course) very common in P-skip mode. */
 614                 if( mvx | mvy )
 615                     h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
 616                                      h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
 617                                      mvx, 2*mvy>>v_shift, 8, height );
 618                 else
 619                     h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
 620                                                          h->mb.pic.i_stride[1], height );
 621
 622                 if( h->sh.weight[0][1].weightfn )
 623                     h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
 624                                                        h->mb.pic.p_fdec[1], FDEC_STRIDE,
 625                                                        &h->sh.weight[0][1], height );
 626                 if( h->sh.weight[0][2].weightfn )
 627                     h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
 628                                                        h->mb.pic.p_fdec[2], FDEC_STRIDE,
 629                                                        &h->sh.weight[0][2], height );
 630             }
 631         }
 632
 633         x264_macroblock_encode_skip( h );
 634         return;
 635     }
 636     if( h->mb.i_type == B_SKIP )
 637     {
 638         /* don't do bskip motion compensation if it was already done in macroblock_analyse */
 639         if( !h->mb.b_skip_mc )
 640             x264_mb_mc( h );
 641         x264_macroblock_encode_skip( h );
 642         return;
 643     }
 644
 645     if( h->mb.i_type == I_16x16 )
 646     {
 647         h->mb.b_transform_8x8 = 0;
 648
 649         for( int p = 0; p < plane_count; p++ )
 650         {
 651             x264_mb_encode_i16x16( h, p, i_qp );
 652             i_qp = h->mb.i_chroma_qp;
 653         }
 654     }
 655     else if( h->mb.i_type == I_8x8 )
 656     {
 657         h->mb.b_transform_8x8 = 1;
 658         /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
 659         if( h->mb.i_skip_intra )
 660         {
 661             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
 662             M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
 663             M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
 664             M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
 665             M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
 666             h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
 667             /* In RD mode, restore the now-overwritten DCT data. */
 668             if( h->mb.i_skip_intra == 2 )
 669                 h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
 670         }
 671         for( int p = 0; p < plane_count; p++ )
 672         {
 673             for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0 ; i < 4; i++ )
 674             {
 675                 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
 676                 x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
 677             }
 678             i_qp = h->mb.i_chroma_qp;
 679         }
 680     }
 681     else if( h->mb.i_type == I_4x4 )
 682     {
 683         h->mb.b_transform_8x8 = 0;
 684         /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
 685         if( h->mb.i_skip_intra )
 686         {
 687             h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
 688             M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
 689             M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
 690             M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
 691             M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
 692             h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
 693             /* In RD mode, restore the now-overwritten DCT data. */
 694             if( h->mb.i_skip_intra == 2 )
 695                 h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
 696         }
 697         for( int p = 0; p < plane_count; p++ )
 698         {
 699             for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0 ; i < 16; i++ )
 700             {
 701                 pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
 702                 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 703
 704                 if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 705                     /* emulate missing topright samples */
 706                     MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
 707
 708                 x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
 709             }
 710             i_qp = h->mb.i_chroma_qp;
 711         }
 712     }
 713     else    /* Inter MB */
 714     {
 715         int i_decimate_mb = 0;
 716
 717         /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
 718         if( !h->mb.b_skip_mc )
 719             x264_mb_mc( h );
 720
 721         if( h->mb.b_lossless )
 722         {
 723             if( h->mb.b_transform_8x8 )
 724                 for( int p = 0; p < plane_count; p++ )
 725                     for( int i8x8 = 0; i8x8 < 4; i8x8++ )
 726                     {
 727                         int x = i8x8&1;
 728                         int y = i8x8>>1;
 729                         nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
 730                                                                            h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
 731                         STORE_8x8_NNZ( p, i8x8, nz );
 732                         h->mb.i_cbp_luma |= nz << i8x8;
 733                     }
 734             else
 735                 for( int p = 0; p < plane_count; p++ )
 736                     for( int i4x4 = 0; i4x4 < 16; i4x4++ )
 737                     {
 738                         nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
 739                                                  h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
 740                                                  h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
 741                         h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
 742                         h->mb.i_cbp_luma |= nz << (i4x4>>2);
 743                     }
 744         }
 745         else if( h->mb.b_transform_8x8 )
 746         {
 747             ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
 748             b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 749
 750             for( int p = 0; p < plane_count; p++ )
 751             {
 752                 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
 753                 h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
 754
 755                 int plane_cbp = 0;
 756                 for( int idx = 0; idx < 4; idx++ )
 757                 {
 758                     nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );
 759
 760                     if( nz )
 761                     {
 762                         h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
 763                         if( b_decimate )
 764                         {
 765                             int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
 766                             i_decimate_mb += i_decimate_8x8;
 767                             if( i_decimate_8x8 >= 4 )
 768                                 plane_cbp |= 1<<idx;
 769                         }
 770                         else
 771                             plane_cbp |= 1<<idx;
 772                     }
 773                 }
 774
 775                 if( i_decimate_mb < 6 && b_decimate )
 776                 {
 777                     plane_cbp = 0;
 778                     CLEAR_16x16_NNZ( p );
 779                 }
 780                 else
 781                 {
 782                     for( int idx = 0; idx < 4; idx++ )
 783                     {
 784                         int x = idx&1;
 785                         int y = idx>>1;
 786
 787                         if( plane_cbp&(1<<idx) )
 788                         {
 789                             h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[p?CQM_8PC:CQM_8PY], i_qp );
 790                             h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
 791                             STORE_8x8_NNZ( p, idx, 1 );
 792                         }
 793                         else
 794                             STORE_8x8_NNZ( p, idx, 0 );
 795                     }
 796                 }
 797                 h->mb.i_cbp_luma |= plane_cbp;
 798                 i_qp = h->mb.i_chroma_qp;
 799             }
 800         }
 801         else
 802         {
 803             ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
 804             for( int p = 0; p < plane_count; p++ )
 805             {
 806                 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
 807                 h->nr_count[0+!!p*2] += h->mb.b_noise_reduction * 16;
 808
 809                 int plane_cbp = 0;
 810                 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
 811                 {
 812                     int i_decimate_8x8 = 0;
 813                     int cbp = 0;
 814
 815                     /* encode one 4x4 block */
 816                     for( int i4x4 = 0; i4x4 < 4; i4x4++ )
 817                     {
 818                         int idx = i8x8 * 4 + i4x4;
 819
 820                         nz = x264_quant_4x4( h, dct4x4[idx], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, idx );
 821                         h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz;
 822
 823                         if( nz )
 824                         {
 825                             h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
 826                             h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[p?CQM_4PC:CQM_4PY], i_qp );
 827                             if( b_decimate && i_decimate_8x8 < 6 )
 828                                 i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
 829                             cbp = 1;
 830                         }
 831                     }
 832
 833                     int x = i8x8&1;
 834                     int y = i8x8>>1;
 835
 836                     /* decimate this 8x8 block */
 837                     i_decimate_mb += i_decimate_8x8;
 838                     if( b_decimate )
 839                     {
 840                         if( i_decimate_8x8 < 4 )
 841                             STORE_8x8_NNZ( p, i8x8, 0 );
 842                         else
 843                             plane_cbp |= 1<<i8x8;
 844                     }
 845                     else if( cbp )
 846                     {
 847                         h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
 848                         plane_cbp |= 1<<i8x8;
 849                     }
 850                 }
 851
 852                 if( b_decimate )
 853                 {
 854                     if( i_decimate_mb < 6 )
 855                     {
 856                         plane_cbp = 0;
 857                         CLEAR_16x16_NNZ( p );
 858                     }
 859                     else
 860                     {
 861                         for( int i8x8 = 0; i8x8 < 4; i8x8++ )
 862                             if( plane_cbp&(1<<i8x8) )
 863                                 h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
 864                     }
 865                 }
 866                 h->mb.i_cbp_luma |= plane_cbp;
 867                 i_qp = h->mb.i_chroma_qp;
 868             }
 869         }
 870     }
 871
 872     /* encode chroma */
 873     if( chroma )
 874     {
 875         if( IS_INTRA( h->mb.i_type ) )
 876         {
 877             int i_mode = h->mb.i_chroma_pred_mode;
 878             if( h->mb.b_lossless )
 879                 x264_predict_lossless_chroma( h, i_mode );
 880             else
 881             {
 882                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
 883                 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
 884             }
 885         }
 886
 887         /* encode the 8x8 blocks */
 888         x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
 889     }
 890     else
 891         h->mb.i_cbp_chroma = 0;
 892
 893     /* store cbp */
 894     int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
 895     if( h->param.b_cabac )
 896         cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC    ]] << 8
 897             |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
 898             |  h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
 899     h->mb.cbp[h->mb.i_mb_xy] = cbp;
 900
 901     /* Check for P_SKIP
 902      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
 903      *      (if multiple mv give same result)*/
 904     if( !b_force_no_skip )
 905     {
 906         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
 907             !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
 908             M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
 909             && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
 910         {
 911             h->mb.i_type = P_SKIP;
 912         }
 913
 914         /* Check for B_SKIP */
 915         if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
 916         {
 917             h->mb.i_type = B_SKIP;
 918         }
 919     }
 920 }
 921
 922 void x264_macroblock_encode( x264_t *h )
 923 {
 924     if( CHROMA444 )
 925         x264_macroblock_encode_internal( h, 3, 0 );
 926     else
 927         x264_macroblock_encode_internal( h, 1, 1 );
 928 }
 929
 930 /*****************************************************************************
 931  * x264_macroblock_probe_skip:
 932  *  Check if the current MB could be encoded as a [PB]_SKIP
 933  *****************************************************************************/
 934 static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
 935 {
 936     ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
 937     ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
 938     ALIGNED_4( int16_t mvp[2] );
 939     int i_qp = h->mb.i_qp;
 940
 941     for( int p = 0; p < plane_count; p++ )
 942     {
 943         int quant_cat = p ? CQM_4PC : CQM_4PY;
 944         if( !b_bidir )
 945         {
 946             /* Get the MV */
 947             mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
 948             mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );
 949
 950             /* Motion compensation */
 951             h->mc.mc_luma( h->mb.pic.p_fdec[p],    FDEC_STRIDE,
 952                            &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
 953                            mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
 954         }
 955
 956         for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
 957         {
 958             int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
 959             int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
 960             /* get luma diff */
 961             h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
 962                                         h->mb.pic.p_fdec[p] + fdec_offset );
 963             /* encode one 4x4 block */
 964             for( int i4x4 = 0; i4x4 < 4; i4x4++ )
 965             {
 966                 if( h->mb.b_noise_reduction )
 967                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
 968                 if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ) )
 969                     continue;
 970                 h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
 971                 i_decimate_mb += h->quantf.decimate_score16( dctscan );
 972                 if( i_decimate_mb >= 6 )
 973                     return 0;
 974             }
 975         }
 976         i_qp = h->mb.i_chroma_qp;
 977     }
 978
 979     if( chroma == CHROMA_420 || chroma == CHROMA_422 )
 980     {
 981         i_qp = h->mb.i_chroma_qp;
 982         int chroma422 = chroma == CHROMA_422;
 983         int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
 984         int ssd;
 985         ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
 986
 987         if( !b_bidir )
 988         {
 989             /* Special case for mv0, which is (of course) very common in P-skip mode. */
 990             if( M32( mvp ) )
 991                 h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
 992                                  h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
 993                                  mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
 994             else
 995                 h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
 996                                                      h->mb.pic.i_stride[1], chroma422?16:8 );
 997         }
 998
 999         for( int ch = 0; ch < 2; ch++ )
1000         {
1001             pixel *p_src = h->mb.pic.p_fenc[1+ch];
1002             pixel *p_dst = h->mb.pic.p_fdec[1+ch];
1003
1004             if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
1005                 h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
1006                                                       h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
1007                                                       &h->sh.weight[0][1+ch], chroma422?16:8 );
1008
1009             /* there is almost never a termination during chroma, but we can't avoid the check entirely */
1010             /* so instead we check SSD and skip the actual check if the score is low enough. */
1011             ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
1012             if( ssd < thresh )
1013                 continue;
1014
1015             /* The vast majority of chroma checks will terminate during the DC check or the higher
1016              * threshold check, so we can save time by doing a DC-only DCT. */
1017             if( h->mb.b_noise_reduction )
1018             {
1019                 for( int i = 0; i <= chroma422; i++ )
1020                     h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
1021
1022                 for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
1023                 {
1024                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
1025                     dct_dc[i4x4] = dct4x4[i4x4][0];
1026                 }
1027             }
1028             else
1029             {
1030                 if( chroma422 )
1031                     h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
1032                 else
1033                     h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
1034             }
1035
1036             for( int i = 0; i <= chroma422; i++ )
1037                 if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
1038                                             h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
1039                     return 0;
1040
1041             /* If there wasn't a termination in DC, we can check against a much higher threshold. */
1042             if( ssd < thresh*4 )
1043                 continue;
1044
1045             if( !h->mb.b_noise_reduction )
1046                for( int i = 0; i <= chroma422; i++ )
1047                     h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
1048
1049             /* calculate dct coeffs */
1050             for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ )
1051             {
1052                 dct4x4[i4x4][0] = 0;
1053                 if( h->mb.b_noise_reduction )
1054                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
1055                 if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
1056                     continue;
1057                 h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
1058                 i_decimate_mb += h->quantf.decimate_score15( dctscan );
1059                 if( i_decimate_mb >= 7 )
1060                     return 0;
1061             }
1062         }
1063     }
1064
1065     h->mb.b_skip_mc = 1;
1066     return 1;
1067 }
1068
1069 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
1070 {
1071     if( CHROMA_FORMAT == CHROMA_444 )
1072         return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
1073     else if( CHROMA_FORMAT == CHROMA_422 )
1074         return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
1075     else
1076         return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
1077 }
1078
1079 /****************************************************************************
1080  * DCT-domain noise reduction / adaptive deadzone
1081  * from libavcodec
1082  ****************************************************************************/
1083
1084 void x264_noise_reduction_update( x264_t *h )
1085 {
1086     h->nr_offset = h->nr_offset_denoise;
1087     h->nr_residual_sum = h->nr_residual_sum_buf[0];
1088     h->nr_count = h->nr_count_buf[0];
1089     for( int cat = 0; cat < 3 + CHROMA444; cat++ )
1090     {
1091         int dct8x8 = cat&1;
1092         int size = dct8x8 ? 64 : 16;
1093         const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
1094
1095         if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
1096         {
1097             for( int i = 0; i < size; i++ )
1098                 h->nr_residual_sum[cat][i] >>= 1;
1099             h->nr_count[cat] >>= 1;
1100         }
1101
1102         for( int i = 0; i < size; i++ )
1103             h->nr_offset[cat][i] =
1104                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
1105                  + h->nr_residual_sum[cat][i]/2)
1106               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
1107
1108         /* Don't denoise DC coefficients */
1109         h->nr_offset[cat][0] = 0;
1110     }
1111 }
1112
1113 /*****************************************************************************
1114  * RD only; 4 calls to this do not make up for one macroblock_encode.
1115  * doesn't transform chroma dc.
1116  *****************************************************************************/
1117 static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
1118 {
1119     int b_decimate = h->mb.b_dct_decimate;
1120     int i_qp = h->mb.i_qp;
1121     int x = i8&1;
1122     int y = i8>>1;
1123     int nz;
1124     int chroma422 = chroma == CHROMA_422;
1125
1126     h->mb.i_cbp_chroma = 0;
1127     h->mb.i_cbp_luma &= ~(1 << i8);
1128
1129     if( !h->mb.b_skip_mc )
1130         x264_mb_mc_8x8( h, i8 );
1131
1132     if( h->mb.b_lossless )
1133     {
1134         for( int p = 0; p < plane_count; p++ )
1135         {
1136             pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
1137             pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
1138             int nnz8x8 = 0;
1139             if( h->mb.b_transform_8x8 )
1140             {
1141                 nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
1142                 STORE_8x8_NNZ( p, i8, nnz8x8 );
1143             }
1144             else
1145             {
1146                 for( int i4 = i8*4; i4 < i8*4+4; i4++ )
1147                 {
1148                     nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
1149                                              h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
1150                                              h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
1151                     h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
1152                     nnz8x8 |= nz;
1153                 }
1154             }
1155             h->mb.i_cbp_luma |= nnz8x8 << i8;
1156         }
1157         if( chroma == CHROMA_420 || chroma == CHROMA_422 )
1158         {
1159             for( int ch = 0; ch < 2; ch++ )
1160             {
1161                 dctcoef dc;
1162                 pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
1163                 pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
1164
1165                 for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
1166                 {
1167                     int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
1168                     nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
1169                     h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
1170                 }
1171             }
1172             h->mb.i_cbp_chroma = 0x02;
1173         }
1174     }
1175     else
1176     {
1177         if( h->mb.b_transform_8x8 )
1178         {
1179             for( int p = 0; p < plane_count; p++ )
1180             {
1181                 int quant_cat = p ? CQM_8PC : CQM_8PY;
1182                 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
1183                 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
1184                 ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
1185                 h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
1186                 int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
1187                 if( nnz8x8 )
1188                 {
1189                     h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );
1190
1191                     if( b_decimate && !h->mb.b_trellis )
1192                         nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );
1193
1194                     if( nnz8x8 )
1195                     {
1196                         h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
1197                         h->dctf.add8x8_idct8( p_fdec, dct8x8 );
1198                         STORE_8x8_NNZ( p, i8, 1 );
1199                     }
1200                     else
1201                         STORE_8x8_NNZ( p, i8, 0 );
1202                 }
1203                 else
1204                     STORE_8x8_NNZ( p, i8, 0 );
1205                 h->mb.i_cbp_luma |= nnz8x8 << i8;
1206                 i_qp = h->mb.i_chroma_qp;
1207             }
1208         }
1209         else
1210         {
1211             for( int p = 0; p < plane_count; p++ )
1212             {
1213                 int quant_cat = p ? CQM_4PC : CQM_4PY;
1214                 pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
1215                 pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
1216                 int i_decimate_8x8 = 0, nnz8x8 = 0;
1217                 ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
1218                 h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
1219                 for( int i4 = 0; i4 < 4; i4++ )
1220                 {
1221                     nz = x264_quant_4x4( h, dct4x4[i4], i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i8*4+i4 );
1222                     h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4]] = nz;
1223                     if( nz )
1224                     {
1225                         h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4], dct4x4[i4] );
1226                         h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[quant_cat], i_qp );
1227                         if( b_decimate )
1228                             i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4] );
1229                         nnz8x8 = 1;
1230                     }
1231                 }
1232
1233                 if( b_decimate && i_decimate_8x8 < 4 )
1234                     nnz8x8 = 0;
1235
1236                 if( nnz8x8 )
1237                     h->dctf.add8x8_idct( p_fdec, dct4x4 );
1238                 else
1239                     STORE_8x8_NNZ( p, i8, 0 );
1240
1241                 h->mb.i_cbp_luma |= nnz8x8 << i8;
1242                 i_qp = h->mb.i_chroma_qp;
1243             }
1244         }
1245
1246         if( chroma == CHROMA_420 || chroma == CHROMA_422 )
1247         {
1248             i_qp = h->mb.i_chroma_qp;
1249             for( int ch = 0; ch < 2; ch++ )
1250             {
1251                 ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] );
1252                 pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
1253                 pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
1254
1255                 for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
1256                 {
1257                     h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );
1258
1259                     if( h->mb.b_noise_reduction )
1260                         h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
1261                     dct4x4[i4x4][0] = 0;
1262
1263                     if( h->mb.b_trellis )
1264                         nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
1265                     else
1266                         nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
1267
1268                     int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
1269                     h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
1270                     if( nz )
1271                     {
1272                         h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
1273                         h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
1274                         h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
1275                     }
1276                 }
1277             }
1278             h->mb.i_cbp_chroma = 0x02;
1279         }
1280     }
1281 }
1282
1283 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
1284 {
1285     if( CHROMA444 )
1286         x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
1287     else if( CHROMA_FORMAT == CHROMA_422 )
1288         x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
1289     else
1290         x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
1291 }
1292
1293 /*****************************************************************************
1294  * RD only, luma only (for 4:2:0)
1295  *****************************************************************************/
1296 static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
1297 {
1298     int i_qp = h->mb.i_qp;
1299
1300     for( int p = 0; p < plane_count; p++ )
1301     {
1302         int quant_cat = p ? CQM_4PC : CQM_4PY;
1303         pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
1304         pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
1305         int nz;
1306
1307         /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
1308
1309         if( h->mb.b_lossless )
1310         {
1311             nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
1312             h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
1313         }
1314         else
1315         {
1316             ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
1317             h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
1318             nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
1319             h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
1320             if( nz )
1321             {
1322                 h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
1323                 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
1324                 h->dctf.add4x4_idct( p_fdec, dct4x4 );
1325             }
1326         }
1327         i_qp = h->mb.i_chroma_qp;
1328     }
1329 }
1330
1331 void x264_macroblock_encode_p4x4( x264_t *h, int i8 )
1332 {
1333     if( CHROMA444 )
1334         x264_macroblock_encode_p4x4_internal( h, i8, 3 );
1335     else
1336         x264_macroblock_encode_p4x4_internal( h, i8, 1 );
1337 }