git.sesse.net Git - x264/blob - encoder/macroblock.c

   1 /*****************************************************************************
   2  * macroblock.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdio.h>
  25 #include <string.h>
  26
  27 #include "common/common.h"
  28 #include "macroblock.h"
  29
  30
  31 /* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
  32 /* FIXME this seems to make better decisions with cqm=jvt, but could screw up
  33  * with general custom matrices. */
  34 static const int def_quant4_mf[6][4][4] =
  35 {
  36     { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
  37       { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
  38     { { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 },
  39       { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 } },
  40     { { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 },
  41       { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 } },
  42     { {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 },
  43       {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 } },
  44     { {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 },
  45       {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 } },
  46     { {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 },
  47       {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 } }
  48 };
  49
  50 /****************************************************************************
  51  * Scan and Quant functions
  52  ****************************************************************************/
  53
  54 #define ZIG(i,y,x) level[i] = dct[x][y];
  55 static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] )
  56 {
  57     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
  58     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
  59     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
  60     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
  61     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
  62     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
  63     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
  64     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
  65     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
  66     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
  67     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
  68     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
  69     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
  70     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
  71     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
  72     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
  73 }
  74 static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
  75 {
  76     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
  77     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
  78     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
  79     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
  80 }
  81 static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
  82 {
  83                 ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
  84     ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
  85     ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
  86     ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
  87 }
  88 static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
  89 {
  90     ZIG(0,0,0)
  91     ZIG(1,0,1)
  92     ZIG(2,1,0)
  93     ZIG(3,1,1)
  94 }
  95 #undef ZIG
  96
  97 #define ZIG(i,y,x) {\
  98     int oe = x+y*FENC_STRIDE;\
  99     int od = x+y*FDEC_STRIDE;\
 100     level[i] = p_src[oe] - p_dst[od];\
 101     p_dst[od] = p_src[oe];\
 102 }
 103 static inline void sub_zigzag_4x4full( int level[16], const uint8_t *p_src, uint8_t *p_dst )
 104 {
 105     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
 106     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
 107     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
 108     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 109 }
 110 static inline void sub_zigzag_4x4( int level[15], const uint8_t *p_src, uint8_t *p_dst )
 111 {
 112                 ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
 113     ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
 114     ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
 115     ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
 116 }
 117 #undef ZIG
 118
 119 static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra )
 120 {
 121     const int i_qbits = 16 + i_qscale / 6;
 122     const int i_mf = i_qscale % 6;
 123     const int f = ( 1 << (i_qbits + b_intra) ) / 6;
 124     h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f );
 125 }
 126 static void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
 127 {
 128     const int i_qbits = 15 + i_qscale / 6;
 129     const int i_mf = i_qscale % 6;
 130     const int f = ( 1 << (i_qbits + b_intra) ) / 6;
 131     h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
 132 }
 133 static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale )
 134 {
 135     const int i_qbits = 16 + i_qscale / 6;
 136     const int i_mf = i_qscale % 6;
 137     const int f = ( 1 << i_qbits ) / 3;
 138     h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 139 }
 140 static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra )
 141 {
 142     const int i_qbits = 16 + i_qscale / 6;
 143     const int i_mf = i_qscale % 6;
 144     const int f = ( 1 << (i_qbits + b_intra) ) / 6;
 145     h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 146 }
 147
 148 /* (ref: JVT-B118)
 149  * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
 150  * to 0 (low score means set it to null)
 151  * Used in inter macroblock (luma and chroma)
 152  *  luma: for a 8x8 block: if score < 4 -> null
 153  *        for the complete mb: if score < 6 -> null
 154  *  chroma: for the complete mb: if score < 7 -> null
 155  */
 156 static int x264_mb_decimate_score( int *dct, int i_max )
 157 {
 158     static const int i_ds_table4[16] = {
 159         3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
 160     static const int i_ds_table8[64] = {
 161         3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
 162         1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
 163         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 164         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
 165
 166     const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4;
 167     int i_score = 0;
 168     int idx = i_max - 1;
 169
 170     while( idx >= 0 && dct[idx] == 0 )
 171         idx--;
 172
 173     while( idx >= 0 )
 174     {
 175         int i_run;
 176
 177         if( abs( dct[idx--] ) > 1 )
 178             return 9;
 179
 180         i_run = 0;
 181         while( idx >= 0 && dct[idx] == 0 )
 182         {
 183             idx--;
 184             i_run++;
 185         }
 186         i_score += ds_table[i_run];
 187     }
 188
 189     return i_score;
 190 }
 191
 192 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
 193 {
 194     int x = 4 * block_idx_x[idx];
 195     int y = 4 * block_idx_y[idx];
 196     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
 197     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
 198     int16_t dct4x4[4][4];
 199
 200     if( h->mb.b_lossless )
 201     {
 202         sub_zigzag_4x4full( h->dct.block[idx].luma4x4, p_src, p_dst );
 203         return;
 204     }
 205
 206     h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
 207
 208     if( h->mb.b_trellis )
 209         x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
 210     else
 211         quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
 212
 213     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
 214     h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
 215
 216     /* output samples to fdec */
 217     h->dctf.add4x4_idct( p_dst, dct4x4 );
 218 }
 219
 220 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
 221 {
 222     int x = 8 * (idx&1);
 223     int y = 8 * (idx>>1);
 224     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
 225     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
 226     int16_t dct8x8[8][8];
 227
 228     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
 229
 230     if( h->mb.b_trellis )
 231         x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
 232     else
 233         quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
 234
 235     scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
 236     h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
 237     h->dctf.add8x8_idct8( p_dst, dct8x8 );
 238 }
 239
 240 static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
 241 {
 242     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 243     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 244
 245     int16_t dct4x4[16+1][4][4];
 246
 247     int i;
 248
 249     if( h->mb.b_lossless )
 250     {
 251         for( i = 0; i < 16; i++ )
 252         {
 253             int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
 254             int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
 255             sub_zigzag_4x4( h->dct.block[i].residual_ac, p_src+oe, p_dst+od );
 256             dct4x4[0][block_idx_x[i]][block_idx_y[i]] = p_src[oe] - p_dst[od];
 257             p_dst[od] = p_src[oe];
 258         }
 259         scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 260         return;
 261     }
 262
 263     h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
 264     for( i = 0; i < 16; i++ )
 265     {
 266         /* copy dc coeff */
 267         dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
 268
 269         /* quant/scan/dequant */
 270         if( h->mb.b_trellis )
 271             x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
 272         else
 273             quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
 274
 275         scan_zigzag_4x4( h->dct.block[i].residual_ac, dct4x4[1+i] );
 276         h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
 277     }
 278
 279     h->dctf.dct4x4dc( dct4x4[0] );
 280     quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale );
 281     scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 282
 283     /* output samples to fdec */
 284     h->dctf.idct4x4dc( dct4x4[0] );
 285     x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
 286
 287     /* calculate dct coeffs */
 288     for( i = 0; i < 16; i++ )
 289     {
 290         /* copy dc coeff */
 291         dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
 292     }
 293     /* put pixels to fdec */
 294     h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
 295 }
 296
 297 static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
 298 {
 299     int i, ch;
 300     int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
 301
 302     for( ch = 0; ch < 2; ch++ )
 303     {
 304         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 305         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 306         int i_decimate_score = 0;
 307
 308         int16_t dct2x2[2][2];
 309         int16_t dct4x4[4][4][4];
 310
 311         if( h->mb.b_lossless )
 312         {
 313             for( i = 0; i < 4; i++ )
 314             {
 315                 int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
 316                 int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
 317                 sub_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, p_src+oe, p_dst+od );
 318                 h->dct.chroma_dc[ch][i] = p_src[oe] - p_dst[od];
 319                 p_dst[od] = p_src[oe];
 320             }
 321             continue;
 322         }
 323
 324         h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
 325         /* calculate dct coeffs */
 326         for( i = 0; i < 4; i++ )
 327         {
 328             /* copy dc coeff */
 329             dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
 330
 331             /* no trellis; it doesn't seem to help chroma noticeably */
 332             quant_4x4( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 333             scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
 334
 335             if( b_decimate )
 336             {
 337                 i_decimate_score += x264_mb_decimate_score( h->dct.block[16+i+ch*4].residual_ac, 15 );
 338             }
 339         }
 340
 341         h->dctf.dct2x2dc( dct2x2 );
 342         quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 343         scan_zigzag_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
 344
 345         /* output samples to fdec */
 346         h->dctf.idct2x2dc( dct2x2 );
 347         x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale );  /* XXX not inversed */
 348
 349         if( b_decimate && i_decimate_score < 7 )
 350         {
 351             /* Near null chroma 8x8 block so make it null (bits saving) */
 352             memset( &h->dct.block[16+ch*4], 0, 4 * sizeof( *h->dct.block ) );
 353             if( !array_non_zero( (int*)dct2x2, sizeof(dct2x2)/sizeof(int) ) )
 354                 continue;
 355             memset( dct4x4, 0, sizeof( dct4x4 ) );
 356         }
 357         else
 358         {
 359             for( i = 0; i < 4; i++ )
 360                 h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
 361         }
 362
 363         for( i = 0; i < 4; i++ )
 364             dct4x4[i][0][0] = dct2x2[0][i];
 365         h->dctf.add8x8_idct( p_dst, dct4x4 );
 366     }
 367 }
 368
 369 static void x264_macroblock_encode_skip( x264_t *h )
 370 {
 371     int i;
 372     h->mb.i_cbp_luma = 0x00;
 373     h->mb.i_cbp_chroma = 0x00;
 374
 375     for( i = 0; i < 16+8; i++ )
 376     {
 377         h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
 378     }
 379
 380     /* store cbp */
 381     h->mb.cbp[h->mb.i_mb_xy] = 0;
 382 }
 383
 384 /*****************************************************************************
 385  * x264_macroblock_encode_pskip:
 386  *  Encode an already marked skip block
 387  *****************************************************************************/
 388 void x264_macroblock_encode_pskip( x264_t *h )
 389 {
 390     const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
 391                                 h->mb.mv_min[0], h->mb.mv_max[0] );
 392     const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
 393                                 h->mb.mv_min[1], h->mb.mv_max[1] );
 394
 395     /* Motion compensation XXX probably unneeded */
 396     h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 397                    h->mb.pic.p_fdec[0],    FDEC_STRIDE,
 398                    mvx, mvy, 16, 16 );
 399
 400     /* Chroma MC */
 401     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
 402                      h->mb.pic.p_fdec[1],       FDEC_STRIDE,
 403                      mvx, mvy, 8, 8 );
 404
 405     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
 406                      h->mb.pic.p_fdec[2],       FDEC_STRIDE,
 407                      mvx, mvy, 8, 8 );
 408
 409     x264_macroblock_encode_skip( h );
 410 }
 411
 412 /*****************************************************************************
 413  * x264_macroblock_encode:
 414  *****************************************************************************/
 415 void x264_macroblock_encode( x264_t *h )
 416 {
 417     int i_cbp_dc = 0;
 418     int i_qp = h->mb.i_qp;
 419     int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
 420     int i;
 421
 422     if( h->mb.i_type == P_SKIP )
 423     {
 424         /* A bit special */
 425         x264_macroblock_encode_pskip( h );
 426         return;
 427     }
 428     if( h->mb.i_type == B_SKIP )
 429     {
 430         /* XXX motion compensation is probably unneeded */
 431         x264_mb_mc( h );
 432         x264_macroblock_encode_skip( h );
 433         return;
 434     }
 435
 436     if( h->mb.i_type == I_16x16 )
 437     {
 438         const int i_mode = h->mb.i_intra16x16_pred_mode;
 439         h->mb.b_transform_8x8 = 0;
 440         /* do the right prediction */
 441         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
 442
 443         /* encode the 16x16 macroblock */
 444         x264_mb_encode_i16x16( h, i_qp );
 445     }
 446     else if( h->mb.i_type == I_8x8 )
 447     {
 448         DECLARE_ALIGNED( uint8_t, edge[33], 8 );
 449         h->mb.b_transform_8x8 = 1;
 450         for( i = 0; i < 4; i++ )
 451         {
 452             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
 453             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
 454
 455             x264_predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
 456             h->predict_8x8[i_mode]( p_dst, edge );
 457             x264_mb_encode_i8x8( h, i, i_qp );
 458         }
 459     }
 460     else if( h->mb.i_type == I_4x4 )
 461     {
 462         h->mb.b_transform_8x8 = 0;
 463         for( i = 0; i < 16; i++ )
 464         {
 465             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE];
 466             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 467
 468             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 469                 /* emulate missing topright samples */
 470                 *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
 471
 472             h->predict_4x4[i_mode]( p_dst );
 473             x264_mb_encode_i4x4( h, i, i_qp );
 474         }
 475     }
 476     else    /* Inter MB */
 477     {
 478         int i8x8, i4x4, idx;
 479         int i_decimate_mb = 0;
 480
 481         /* Motion compensation */
 482         x264_mb_mc( h );
 483
 484         if( h->mb.b_lossless )
 485         {
 486             for( i4x4 = 0; i4x4 < 16; i4x4++ )
 487             {
 488                 int x = 4*block_idx_x[i4x4];
 489                 int y = 4*block_idx_y[i4x4];
 490                 sub_zigzag_4x4full( h->dct.block[i4x4].luma4x4,
 491                                     h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
 492                                     h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
 493             }
 494         }
 495         else if( h->mb.b_transform_8x8 )
 496         {
 497             int16_t dct8x8[4][8][8];
 498             int nnz8x8[4] = {1,1,1,1};
 499             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
 500             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
 501
 502             for( idx = 0; idx < 4; idx++ )
 503             {
 504                 if( h->mb.b_noise_reduction )
 505                     x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
 506                 if( h->mb.b_trellis )
 507                     x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
 508                 else
 509                     quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
 510
 511                 scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
 512
 513                 if( b_decimate )
 514                 {
 515                     int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
 516                     i_decimate_mb += i_decimate_8x8;
 517                     if( i_decimate_8x8 < 4 )
 518                     {
 519                         memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
 520                         memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
 521                         nnz8x8[idx] = 0;
 522                     }
 523                 }
 524                 else
 525                     nnz8x8[idx] = array_non_zero( (int*)dct8x8[idx], sizeof(*dct8x8)/sizeof(int) );
 526             }
 527
 528             if( i_decimate_mb < 6 && b_decimate )
 529                 memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
 530             else
 531             {
 532                 for( idx = 0; idx < 4; idx++ )
 533                     if( nnz8x8[idx] )
 534                     {
 535                         h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
 536                         h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
 537                     }
 538             }
 539         }
 540         else
 541         {
 542             int16_t dct4x4[16][4][4];
 543             int nnz8x8[4] = {1,1,1,1};
 544             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
 545
 546             for( i8x8 = 0; i8x8 < 4; i8x8++ )
 547             {
 548                 int i_decimate_8x8;
 549
 550                 /* encode one 4x4 block */
 551                 i_decimate_8x8 = 0;
 552                 for( i4x4 = 0; i4x4 < 4; i4x4++ )
 553                 {
 554                     idx = i8x8 * 4 + i4x4;
 555
 556                     if( h->mb.b_noise_reduction )
 557                         x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
 558                     if( h->mb.b_trellis )
 559                         x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
 560                     else
 561                         quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
 562
 563                     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
 564
 565                     if( b_decimate )
 566                         i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
 567                 }
 568
 569                 /* decimate this 8x8 block */
 570                 i_decimate_mb += i_decimate_8x8;
 571                 if( i_decimate_8x8 < 4 && b_decimate )
 572                 {
 573                     memset( &dct4x4[i8x8*4], 0, 4 * sizeof( *dct4x4 ) );
 574                     memset( &h->dct.block[i8x8*4], 0, 4 * sizeof( *h->dct.block ) );
 575                     nnz8x8[i8x8] = 0;
 576                 }
 577             }
 578
 579             if( i_decimate_mb < 6 && b_decimate )
 580                 memset( h->dct.block, 0, 16 * sizeof( *h->dct.block ) );
 581             else
 582             {
 583                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
 584                     if( nnz8x8[i8x8] )
 585                     {
 586                         for( i = 0; i < 4; i++ )
 587                             h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
 588                         h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
 589                     }
 590             }
 591         }
 592     }
 593
 594     /* encode chroma */
 595     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 596     if( IS_INTRA( h->mb.i_type ) )
 597     {
 598         const int i_mode = h->mb.i_chroma_pred_mode;
 599         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
 600         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
 601     }
 602
 603     /* encode the 8x8 blocks */
 604     x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qp );
 605
 606     /* Calculate the Luma/Chroma pattern and non_zero_count */
 607     h->mb.i_cbp_luma = 0x00;
 608     if( h->mb.i_type == I_16x16 )
 609     {
 610         for( i = 0; i < 16; i++ )
 611         {
 612             const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
 613             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 614             if( nz > 0 )
 615                 h->mb.i_cbp_luma = 0x0f;
 616         }
 617     }
 618     else if( h->mb.b_transform_8x8 )
 619     {
 620         /* coded_block_flag is enough for CABAC.
 621          * the full non_zero_count is done only in CAVLC. */
 622         for( i = 0; i < 4; i++ )
 623         {
 624             const int nz = array_non_zero( h->dct.luma8x8[i], 64 );
 625             int j;
 626             for( j = 0; j < 4; j++ )
 627                 h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
 628             if( nz > 0 )
 629                 h->mb.i_cbp_luma |= 1 << i;
 630         }
 631     }
 632     else
 633     {
 634         for( i = 0; i < 16; i++ )
 635         {
 636             const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
 637             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 638             if( nz > 0 )
 639                 h->mb.i_cbp_luma |= 1 << (i/4);
 640         }
 641     }
 642
 643     /* Calculate the chroma pattern */
 644     h->mb.i_cbp_chroma = 0x00;
 645     for( i = 0; i < 8; i++ )
 646     {
 647         const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
 648         h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
 649         if( nz > 0 )
 650         {
 651             h->mb.i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
 652         }
 653     }
 654     if( h->mb.i_cbp_chroma == 0x00 && array_non_zero( h->dct.chroma_dc[0], 8 ) )
 655     {
 656         h->mb.i_cbp_chroma = 0x01;    /* dc only */
 657     }
 658
 659     if( h->param.b_cabac )
 660     {
 661         i_cbp_dc = ( h->mb.i_type == I_16x16 && array_non_zero( h->dct.luma16x16_dc, 16 ) )
 662                  | array_non_zero( h->dct.chroma_dc[0], 4 ) << 1
 663                  | array_non_zero( h->dct.chroma_dc[1], 4 ) << 2;
 664     }
 665
 666     /* store cbp */
 667     h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
 668
 669     /* Check for P_SKIP
 670      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
 671      *      (if multiple mv give same result)*/
 672     if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
 673         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
 674         h->mb.cache.ref[0][x264_scan8[0]] == 0 )
 675     {
 676         int mvp[2];
 677
 678         x264_mb_predict_mv_pskip( h, mvp );
 679         if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
 680             h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
 681         {
 682             h->mb.i_type = P_SKIP;
 683         }
 684     }
 685
 686     /* Check for B_SKIP */
 687     if( h->mb.i_type == B_DIRECT &&
 688         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
 689     {
 690         h->mb.i_type = B_SKIP;
 691     }
 692 }
 693
 694 /*****************************************************************************
 695  * x264_macroblock_probe_skip:
 696  *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
 697  *  the previous QP
 698  *****************************************************************************/
 699 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 700 {
 701     DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
 702     DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
 703     DECLARE_ALIGNED( int,     dctscan[16], 16 );
 704
 705     int i_qp = h->mb.i_qp;
 706     int mvp[2];
 707     int ch;
 708
 709     int i8x8, i4x4;
 710     int i_decimate_mb;
 711
 712     if( !b_bidir )
 713     {
 714         /* Get the MV */
 715         x264_mb_predict_mv_pskip( h, mvp );
 716         mvp[0] = x264_clip3( mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
 717         mvp[1] = x264_clip3( mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
 718
 719         /* Motion compensation */
 720         h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 721                        h->mb.pic.p_fdec[0],    FDEC_STRIDE,
 722                        mvp[0], mvp[1], 16, 16 );
 723     }
 724
 725     /* get luma diff */
 726     h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0],
 727                                   h->mb.pic.p_fdec[0] );
 728
 729     for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
 730     {
 731         /* encode one 4x4 block */
 732         for( i4x4 = 0; i4x4 < 4; i4x4++ )
 733         {
 734             const int idx = i8x8 * 4 + i4x4;
 735
 736             quant_4x4( h, dct4x4[idx], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 737             scan_zigzag_4x4full( dctscan, dct4x4[idx] );
 738
 739             i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
 740
 741             if( i_decimate_mb >= 6 )
 742             {
 743                 /* not as P_SKIP */
 744                 return 0;
 745             }
 746         }
 747     }
 748
 749     /* encode chroma */
 750     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 751
 752     for( ch = 0; ch < 2; ch++ )
 753     {
 754         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 755         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 756
 757         if( !b_bidir )
 758         {
 759             h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
 760                              h->mb.pic.p_fdec[1+ch],       FDEC_STRIDE,
 761                              mvp[0], mvp[1], 8, 8 );
 762         }
 763
 764         h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
 765
 766         /* calculate dct DC */
 767         dct2x2[0][0] = dct4x4[0][0][0];
 768         dct2x2[0][1] = dct4x4[1][0][0];
 769         dct2x2[1][0] = dct4x4[2][0][0];
 770         dct2x2[1][1] = dct4x4[3][0][0];
 771         h->dctf.dct2x2dc( dct2x2 );
 772         quant_2x2_dc( h, dct2x2, (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 773         if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
 774         {
 775             /* can't be */
 776             return 0;
 777         }
 778
 779         /* calculate dct coeffs */
 780         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
 781         {
 782             quant_4x4( h, dct4x4[i4x4], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 783             scan_zigzag_4x4( dctscan, dct4x4[i4x4] );
 784
 785             i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
 786             if( i_decimate_mb >= 7 )
 787             {
 788                 return 0;
 789             }
 790         }
 791     }
 792
 793     return 1;
 794 }
 795
 796 /****************************************************************************
 797  * DCT-domain noise reduction / adaptive deadzone
 798  * from libavcodec
 799  ****************************************************************************/
 800
 801 void x264_noise_reduction_update( x264_t *h )
 802 {
 803     int cat, i;
 804     for( cat = 0; cat < 2; cat++ )
 805     {
 806         int size = cat ? 64 : 16;
 807         const int *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
 808
 809         if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
 810         {
 811             for( i = 0; i < size; i++ )
 812                 h->nr_residual_sum[cat][i] >>= 1;
 813             h->nr_count[cat] >>= 1;
 814         }
 815
 816         for( i = 0; i < size; i++ )
 817             h->nr_offset[cat][i] =
 818                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
 819                  + h->nr_residual_sum[cat][i]/2)
 820               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
 821     }
 822 }
 823
 824 void x264_denoise_dct( x264_t *h, int16_t *dct )
 825 {
 826     const int cat = h->mb.b_transform_8x8;
 827     int i;
 828
 829     h->nr_count[cat]++;
 830
 831     for( i = (cat ? 63 : 15); i >= 1; i-- )
 832     {
 833         int level = dct[i];
 834         if( level )
 835         {
 836             if( level > 0 )
 837             {
 838                 h->nr_residual_sum[cat][i] += level;
 839                 level -= h->nr_offset[cat][i];
 840                 if( level < 0 )
 841                     level = 0;
 842             }
 843             else
 844             {
 845                 h->nr_residual_sum[cat][i] -= level;
 846                 level += h->nr_offset[cat][i];
 847                 if( level > 0 )
 848                     level = 0;
 849             }
 850             dct[i] = level;
 851         }
 852     }
 853 }
 854
 855 /*****************************************************************************
 856  * RD only; 4 calls to this do not make up for one macroblock_encode.
 857  * doesn't transform chroma dc.
 858  *****************************************************************************/
 859 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
 860 {
 861     int i_qp = h->mb.i_qp;
 862     uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
 863     uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
 864     int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
 865     int nnz8x8;
 866     int ch;
 867
 868     x264_mb_mc_8x8( h, i8 );
 869
 870     if( h->mb.b_transform_8x8 )
 871     {
 872         int16_t dct8x8[8][8];
 873         h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
 874         quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
 875         scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
 876
 877         if( b_decimate )
 878             nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
 879         else
 880             nnz8x8 = array_non_zero( (int*)dct8x8, sizeof(dct8x8)/sizeof(int) );
 881
 882         if( nnz8x8 )
 883         {
 884             h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
 885             h->dctf.add8x8_idct8( p_fdec, dct8x8 );
 886         }
 887     }
 888     else
 889     {
 890         int i4;
 891         int16_t dct4x4[4][4][4];
 892         h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
 893         quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
 894         quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
 895         quant_4x4( h, dct4x4[2], h->quant4_mf[CQM_4PY], i_qp, 0 );
 896         quant_4x4( h, dct4x4[3], h->quant4_mf[CQM_4PY], i_qp, 0 );
 897         for( i4 = 0; i4 < 4; i4++ )
 898             scan_zigzag_4x4full( h->dct.block[i8*4+i4].luma4x4, dct4x4[i4] );
 899
 900         if( b_decimate )
 901         {
 902             int i_decimate_8x8 = 0;
 903             for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
 904                 i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[i8*4+i4].luma4x4, 16 );
 905             nnz8x8 = 4 <= i_decimate_8x8;
 906         }
 907         else
 908             nnz8x8 = array_non_zero( (int*)dct4x4, sizeof(dct4x4)/sizeof(int) );
 909
 910         if( nnz8x8 )
 911         {
 912             for( i4 = 0; i4 < 4; i4++ )
 913                 h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
 914             h->dctf.add8x8_idct( p_fdec, dct4x4 );
 915         }
 916     }
 917
 918     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 919
 920     for( ch = 0; ch < 2; ch++ )
 921     {
 922         int16_t dct4x4[4][4];
 923         p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
 924         p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
 925
 926         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
 927         quant_4x4( h, dct4x4, h->quant4_mf[CQM_4PC], i_qp, 0 );
 928         scan_zigzag_4x4( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 );
 929         if( array_non_zero( (int*)dct4x4, sizeof(dct4x4)/sizeof(int) ) )
 930         {
 931             h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
 932             h->dctf.add4x4_idct( p_fdec, dct4x4 );
 933         }
 934     }
 935
 936     if( nnz8x8 )
 937         h->mb.i_cbp_luma |= (1 << i8);
 938     else
 939         h->mb.i_cbp_luma &= ~(1 << i8);
 940     h->mb.i_cbp_chroma = 0x02;
 941 }