git.sesse.net Git - x264/blob - encoder/macroblock.c

   1 /*****************************************************************************
   2  * macroblock.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #include "common/common.h"
  29 #include "macroblock.h"
  30
  31
  /* Built-in 4x4 quantization multipliers, indexed [qp % 6][row][col].
   * def_quant4_mf is only for probe_skip; actual encoding uses matrices from set.c.
   * FIXME this seems to make better decisions with cqm=jvt, but could screw up
   * with general custom matrices. */
  /* Each 4x4 matrix is built from three values: a where row and column are both
   * even, c where both are odd, and b everywhere else. */
  #define QUANT4_MF( a, b, c ) \
      { { a, b, a, b }, { b, c, b, c }, { a, b, a, b }, { b, c, b, c } }
  static const int def_quant4_mf[6][4][4] =
  {
      QUANT4_MF( 13107, 8066, 5243 ),
      QUANT4_MF( 11916, 7490, 4660 ),
      QUANT4_MF( 10082, 6554, 4194 ),
      QUANT4_MF(  9362, 5825, 3647 ),
      QUANT4_MF(  8192, 5243, 3355 ),
      QUANT4_MF(  7282, 4559, 2893 )
  };
  #undef QUANT4_MF
  50
  51 /****************************************************************************
  52  * Scan and Quant functions
  53  ****************************************************************************/
  54 //static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
  55 //static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
  56
  /* ZIGZAG(pos,row,col): copy coefficient dct[row][col] to scan position pos. */
  #define ZIGZAG(pos,row,col) level[pos] = dct[row][col]

  /* Reorder all 64 coefficients of an 8x8 block into zigzag scan order. */
  static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] )
  {
      ZIGZAG( 0,0,0); ZIGZAG( 1,0,1); ZIGZAG( 2,1,0); ZIGZAG( 3,2,0);
      ZIGZAG( 4,1,1); ZIGZAG( 5,0,2); ZIGZAG( 6,0,3); ZIGZAG( 7,1,2);
      ZIGZAG( 8,2,1); ZIGZAG( 9,3,0); ZIGZAG(10,4,0); ZIGZAG(11,3,1);
      ZIGZAG(12,2,2); ZIGZAG(13,1,3); ZIGZAG(14,0,4); ZIGZAG(15,0,5);
      ZIGZAG(16,1,4); ZIGZAG(17,2,3); ZIGZAG(18,3,2); ZIGZAG(19,4,1);
      ZIGZAG(20,5,0); ZIGZAG(21,6,0); ZIGZAG(22,5,1); ZIGZAG(23,4,2);
      ZIGZAG(24,3,3); ZIGZAG(25,2,4); ZIGZAG(26,1,5); ZIGZAG(27,0,6);
      ZIGZAG(28,0,7); ZIGZAG(29,1,6); ZIGZAG(30,2,5); ZIGZAG(31,3,4);
      ZIGZAG(32,4,3); ZIGZAG(33,5,2); ZIGZAG(34,6,1); ZIGZAG(35,7,0);
      ZIGZAG(36,7,1); ZIGZAG(37,6,2); ZIGZAG(38,5,3); ZIGZAG(39,4,4);
      ZIGZAG(40,3,5); ZIGZAG(41,2,6); ZIGZAG(42,1,7); ZIGZAG(43,2,7);
      ZIGZAG(44,3,6); ZIGZAG(45,4,5); ZIGZAG(46,5,4); ZIGZAG(47,6,3);
      ZIGZAG(48,7,2); ZIGZAG(49,7,3); ZIGZAG(50,6,4); ZIGZAG(51,5,5);
      ZIGZAG(52,4,6); ZIGZAG(53,3,7); ZIGZAG(54,4,7); ZIGZAG(55,5,6);
      ZIGZAG(56,6,5); ZIGZAG(57,7,4); ZIGZAG(58,7,5); ZIGZAG(59,6,6);
      ZIGZAG(60,5,7); ZIGZAG(61,6,7); ZIGZAG(62,7,6); ZIGZAG(63,7,7);
  }

  /* Reorder all 16 coefficients of a 4x4 block (DC included) into zigzag order. */
  static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
  {
      ZIGZAG( 0,0,0); ZIGZAG( 1,0,1); ZIGZAG( 2,1,0); ZIGZAG( 3,2,0);
      ZIGZAG( 4,1,1); ZIGZAG( 5,0,2); ZIGZAG( 6,0,3); ZIGZAG( 7,1,2);
      ZIGZAG( 8,2,1); ZIGZAG( 9,3,0); ZIGZAG(10,3,1); ZIGZAG(11,2,2);
      ZIGZAG(12,1,3); ZIGZAG(13,2,3); ZIGZAG(14,3,2); ZIGZAG(15,3,3);
  }

  /* Same scan as scan_zigzag_4x4full but without dct[0][0]: only the 15
   * remaining coefficients are written, starting at level[0]. */
  static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
  {
                      ZIGZAG( 0,0,1); ZIGZAG( 1,1,0); ZIGZAG( 2,2,0);
      ZIGZAG( 3,1,1); ZIGZAG( 4,0,2); ZIGZAG( 5,0,3); ZIGZAG( 6,1,2);
      ZIGZAG( 7,2,1); ZIGZAG( 8,3,0); ZIGZAG( 9,3,1); ZIGZAG(10,2,2);
      ZIGZAG(11,1,3); ZIGZAG(12,2,3); ZIGZAG(13,3,2); ZIGZAG(14,3,3);
  }

  /* Copy the four coefficients of a 2x2 DC block in row-major order. */
  static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
  {
      ZIGZAG(0,0,0);
      ZIGZAG(1,0,1);
      ZIGZAG(2,1,0);
      ZIGZAG(3,1,1);
  }
  #undef ZIGZAG
  99
 /* SUBZIG(pos,row,col): store the residual (source minus destination) of the
  * pixel at (row,col) at scan position pos, then overwrite the destination
  * pixel with the source pixel. */
 #define SUBZIG(pos,row,col) do {\
     const int ofs = (col) + (row) * i_stride;\
     level[pos] = p_src[ofs] - p_dst[ofs];\
     p_dst[ofs] = p_src[ofs];\
 } while( 0 )

 /* Pixel-domain residual of a whole 4x4 block, written in zigzag order.
  * Both p_src and p_dst are addressed with the same i_stride; all 16 pixels of
  * p_dst end up equal to p_src. */
 static inline void sub_zigzag_4x4full( int level[16], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
 {
     SUBZIG( 0,0,0); SUBZIG( 1,0,1); SUBZIG( 2,1,0); SUBZIG( 3,2,0);
     SUBZIG( 4,1,1); SUBZIG( 5,0,2); SUBZIG( 6,0,3); SUBZIG( 7,1,2);
     SUBZIG( 8,2,1); SUBZIG( 9,3,0); SUBZIG(10,3,1); SUBZIG(11,2,2);
     SUBZIG(12,1,3); SUBZIG(13,2,3); SUBZIG(14,3,2); SUBZIG(15,3,3);
 }

 /* Same as sub_zigzag_4x4full but skipping the top-left pixel: 15 residuals are
  * written starting at level[0], and p_dst[0] is left untouched. */
 static inline void sub_zigzag_4x4( int level[15], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
 {
                     SUBZIG( 0,0,1); SUBZIG( 1,1,0); SUBZIG( 2,2,0);
     SUBZIG( 3,1,1); SUBZIG( 4,0,2); SUBZIG( 5,0,3); SUBZIG( 6,1,2);
     SUBZIG( 7,2,1); SUBZIG( 8,3,0); SUBZIG( 9,3,1); SUBZIG(10,2,2);
     SUBZIG(11,1,3); SUBZIG(12,2,3); SUBZIG(13,3,2); SUBZIG(14,3,3);
 }
 #undef SUBZIG
 120
 121 static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra )
 122 {
 123     const int i_qbits = 16 + i_qscale / 6;
 124     const int i_mf = i_qscale % 6;
 125     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 126     h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f );
 127 }
 128 static void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
 129 {
 130     const int i_qbits = 15 + i_qscale / 6;
 131     const int i_mf = i_qscale % 6;
 132     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 133     h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
 134 }
 135 static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale )
 136 {
 137     const int i_qbits = 16 + i_qscale / 6;
 138     const int i_mf = i_qscale % 6;
 139     const int f = ( 1 << i_qbits ) / 3;
 140     h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 141 }
 142 static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra )
 143 {
 144     const int i_qbits = 16 + i_qscale / 6;
 145     const int i_mf = i_qscale % 6;
 146     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 147     h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 148 }
 149
 /* (ref: JVT-B118)
  * x264_mb_decimate_score: score a block of scanned, quantized coefficients to
  * decide whether the whole block can be replaced by zeros (a low score means
  * "drop it").
  *
  *  dct:   coefficients in scan order
  *  i_max: number of coefficients; 64 selects the 8x8 table, any other value
  *         the 16-entry 4x4 table
  *
  * Returns 9 as soon as a coefficient with magnitude > 1 is found. Otherwise
  * every non-zero coefficient adds a table value selected by the number of
  * zeros directly below it in scan order.
  *
  * Used in inter macroblock (luma and chroma)
  *  luma: for a 8x8 block: if score < 4 -> null
  *        for the complete mb: if score < 6 -> null
  *  chroma: for the complete mb: if score < 7 -> null
  */
 static int x264_mb_decimate_score( int *dct, int i_max )
 {
     static const int run_score4[16] = {
         3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
     static const int run_score8[64] = {
         3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
         1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

     const int *run_score = run_score4;
     int total = 0;
     int pos;

     if( i_max == 64 )
         run_score = run_score8;

     /* skip the trailing zeros */
     for( pos = i_max - 1; pos >= 0; pos-- )
         if( dct[pos] != 0 )
             break;

     /* pos is on a non-zero coefficient at the top of each iteration */
     while( pos >= 0 )
     {
         const int coef = dct[pos];
         int zeros = 0;

         pos--;
         if( abs( coef ) > 1 )
             return 9;

         for( ; pos >= 0 && dct[pos] == 0; pos-- )
             zeros++;

         total += run_score[zeros];
     }

     return total;
 }
 193
 194 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
 195 {
 196     const int i_stride = h->mb.pic.i_stride[0];
 197     const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
 198     uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
 199     uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
 200     int16_t dct4x4[4][4];
 201
 202     if( h->mb.b_lossless )
 203     {
 204         sub_zigzag_4x4full( h->dct.block[idx].luma4x4, p_src, p_dst, i_stride );
 205         return;
 206     }
 207
 208     h->dctf.sub4x4_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 209
 210     if( h->mb.b_noise_reduction )
 211         x264_denoise_dct( h, (int16_t*)dct4x4 );
 212     if( h->mb.b_trellis )
 213         x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
 214     else
 215         quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
 216
 217     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
 218     h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
 219
 220     /* output samples to fdec */
 221     h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 );
 222 }
 223
 224 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
 225 {
 226     const int i_stride = h->mb.pic.i_stride[0];
 227     const int i_offset = 8 * (idx&1) + 8 * (idx>>1) * i_stride;
 228     uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
 229     uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
 230     int16_t dct8x8[8][8];
 231
 232     h->dctf.sub8x8_dct8( dct8x8, p_src, i_stride, p_dst, i_stride );
 233
 234     if( h->mb.b_noise_reduction )
 235         x264_denoise_dct( h, (int16_t*)dct8x8 );
 236     if( h->mb.b_trellis )
 237         x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
 238     else
 239         quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
 240
 241     scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
 242     h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
 243     h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 );
 244 }
 245
 246 static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
 247 {
 248     const int i_stride = h->mb.pic.i_stride[0];
 249     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 250     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 251
 252     int16_t dct4x4[16+1][4][4];
 253
 254     int i;
 255
 256     if( h->mb.b_lossless )
 257     {
 258         for( i = 0; i < 16; i++ )
 259         {
 260             int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
 261             sub_zigzag_4x4( h->dct.block[i].residual_ac, p_src+o, p_dst+o, i_stride );
 262             dct4x4[0][block_idx_y[i]][block_idx_x[i]] = p_src[o] - p_dst[o];
 263             p_dst[o] = p_src[o];
 264         }
 265         scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 266         return;
 267     }
 268
 269     h->dctf.sub16x16_dct( &dct4x4[1], p_src, i_stride, p_dst, i_stride );
 270     for( i = 0; i < 16; i++ )
 271     {
 272         /* copy dc coeff */
 273         dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
 274
 275         /* quant/scan/dequant */
 276         if( h->mb.b_noise_reduction )
 277             x264_denoise_dct( h, (int16_t*)dct4x4[i] );
 278         if( h->mb.b_trellis )
 279             x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
 280         else
 281             quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
 282
 283         scan_zigzag_4x4( h->dct.block[i].residual_ac, dct4x4[1+i] );
 284         h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
 285     }
 286
 287     h->dctf.dct4x4dc( dct4x4[0] );
 288     quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale );
 289     scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 290
 291     /* output samples to fdec */
 292     h->dctf.idct4x4dc( dct4x4[0] );
 293     x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
 294
 295     /* calculate dct coeffs */
 296     for( i = 0; i < 16; i++ )
 297     {
 298         /* copy dc coeff */
 299         dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
 300     }
 301     /* put pixels to fdec */
 302     h->dctf.add16x16_idct( p_dst, i_stride, &dct4x4[1] );
 303 }
 304
 305 static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
 306 {
 307     int i, ch;
 308
 309     for( ch = 0; ch < 2; ch++ )
 310     {
 311         const int i_stride = h->mb.pic.i_stride[1+ch];
 312         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 313         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 314         int i_decimate_score = 0;
 315
 316         int16_t dct2x2[2][2];
 317         int16_t dct4x4[4][4][4];
 318
 319         if( h->mb.b_lossless )
 320         {
 321             for( i = 0; i < 4; i++ )
 322             {
 323                 int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
 324                 sub_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, p_src+o, p_dst+o, i_stride );
 325                 h->dct.chroma_dc[ch][i] = p_src[o] - p_dst[o];
 326                 p_dst[o] = p_src[o];
 327             }
 328             continue;
 329         }
 330
 331         h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 332         /* calculate dct coeffs */
 333         for( i = 0; i < 4; i++ )
 334         {
 335             /* copy dc coeff */
 336             dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
 337
 338             /* no trellis; it doesn't seem to help chroma noticeably */
 339             quant_4x4( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 340             scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
 341
 342             if( b_inter )
 343             {
 344                 i_decimate_score += x264_mb_decimate_score( h->dct.block[16+i+ch*4].residual_ac, 15 );
 345             }
 346         }
 347
 348         h->dctf.dct2x2dc( dct2x2 );
 349         quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 350         scan_zigzag_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
 351
 352         /* output samples to fdec */
 353         h->dctf.idct2x2dc( dct2x2 );
 354         x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale );  /* XXX not inversed */
 355
 356         if( b_inter && i_decimate_score < 7 )
 357         {
 358             /* Near null chroma 8x8 block so make it null (bits saving) */
 359             memset( dct4x4, 0, sizeof( dct4x4 ) );
 360             memset( &h->dct.block[16+ch*4], 0, 4 * sizeof( *h->dct.block ) );
 361         }
 362         else
 363         {
 364             for( i = 0; i < 4; i++ )
 365                 h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
 366         }
 367
 368         /* calculate dct coeffs */
 369         for( i = 0; i < 4; i++ )
 370         {
 371             /* copy dc coeff */
 372             dct4x4[i][0][0] = dct2x2[0][i];
 373         }
 374         h->dctf.add8x8_idct( p_dst, i_stride, dct4x4 );
 375     }
 376 }
 377
 378 static void x264_macroblock_encode_skip( x264_t *h )
 379 {
 380     int i;
 381     h->mb.i_cbp_luma = 0x00;
 382     h->mb.i_cbp_chroma = 0x00;
 383
 384     for( i = 0; i < 16+8; i++ )
 385     {
 386         h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
 387     }
 388
 389     /* store cbp */
 390     h->mb.cbp[h->mb.i_mb_xy] = 0;
 391 }
 392
 393 /*****************************************************************************
 394  * x264_macroblock_encode_pskip:
 395  *  Encode an already marked skip block
 396  *****************************************************************************/
 397 void x264_macroblock_encode_pskip( x264_t *h )
 398 {
 399     const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
 400                                 h->mb.mv_min[0], h->mb.mv_max[0] );
 401     const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
 402                                 h->mb.mv_min[1], h->mb.mv_max[1] );
 403
 404     /* Motion compensation XXX probably unneeded */
 405     h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 406                     h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
 407                     mvx, mvy, 16, 16 );
 408
 409     /* Chroma MC */
 410     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
 411                       h->mb.pic.p_fdec[1],       h->mb.pic.i_stride[1],
 412                       mvx, mvy, 8, 8 );
 413
 414     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
 415                       h->mb.pic.p_fdec[2],       h->mb.pic.i_stride[2],
 416                       mvx, mvy, 8, 8 );
 417
 418     x264_macroblock_encode_skip( h );
 419 }
 420
 421 /*****************************************************************************
 422  * x264_macroblock_encode:
 423  *****************************************************************************/
 424 void x264_macroblock_encode( x264_t *h )
 425 {
 426     int i_cbp_dc = 0;
 427     int i_qp = h->mb.i_qp;
 428     int i;
 429
 430     if( h->mb.i_type == P_SKIP )
 431     {
 432         /* A bit special */
 433         x264_macroblock_encode_pskip( h );
 434         return;
 435     }
 436     if( h->mb.i_type == B_SKIP )
 437     {
 438         /* XXX motion compensation is probably unneeded */
 439         x264_mb_mc( h );
 440         x264_macroblock_encode_skip( h );
 441         return;
 442     }
 443
 444     if( h->mb.i_type == I_16x16 )
 445     {
 446         const int i_mode = h->mb.i_intra16x16_pred_mode;
 447         h->mb.b_transform_8x8 = 0;
 448         /* do the right prediction */
 449         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 450
 451         /* encode the 16x16 macroblock */
 452         x264_mb_encode_i16x16( h, i_qp );
 453     }
 454     else if( h->mb.i_type == I_8x8 )
 455     {
 456         h->mb.b_transform_8x8 = 1;
 457         for( i = 0; i < 4; i++ )
 458         {
 459             const int i_dst = h->mb.pic.i_stride[0];
 460             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * i_dst];
 461             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
 462
 463             h->predict_8x8[i_mode]( p_dst, i_dst, h->mb.i_neighbour8[i] );
 464             x264_mb_encode_i8x8( h, i, i_qp );
 465         }
 466     }
 467     else if( h->mb.i_type == I_4x4 )
 468     {
 469         h->mb.b_transform_8x8 = 0;
 470         for( i = 0; i < 16; i++ )
 471         {
 472             const int i_dst = h->mb.pic.i_stride[0];
 473             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst];
 474             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 475
 476             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 477                 /* emulate missing topright samples */
 478                 *(uint32_t*) &p_dst[4 - i_dst] = p_dst[3 - i_dst] * 0x01010101U;
 479
 480             h->predict_4x4[i_mode]( p_dst, i_dst );
 481             x264_mb_encode_i4x4( h, i, i_qp );
 482         }
 483     }
 484     else    /* Inter MB */
 485     {
 486         int i8x8, i4x4, idx;
 487         int i_decimate_mb = 0;
 488
 489         /* Motion compensation */
 490         x264_mb_mc( h );
 491
 492         if( h->mb.b_lossless )
 493         {
 494             for( i4x4 = 0; i4x4 < 16; i4x4++ )
 495             {
 496                 int o = block_idx_x[i4x4]*4 + block_idx_y[i4x4]*4 * h->mb.pic.i_stride[0];
 497                 sub_zigzag_4x4full( h->dct.block[i4x4].luma4x4, h->mb.pic.p_fenc[0]+o, h->mb.pic.p_fdec[0]+o, h->mb.pic.i_stride[0] );
 498             }
 499         }
 500         else if( h->mb.b_transform_8x8 )
 501         {
 502             int16_t dct8x8[4][8][8];
 503             int nnz8x8[4] = {1,1,1,1};
 504             h->dctf.sub16x16_dct8( dct8x8,
 505                                    h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 506                                    h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 507
 508             for( idx = 0; idx < 4; idx++ )
 509             {
 510                 if( h->mb.b_noise_reduction )
 511                     x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
 512                 if( h->mb.b_trellis )
 513                     x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
 514                 else
 515                     quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
 516
 517                 scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
 518
 519                 if( !h->mb.b_trellis )
 520                 {
 521                     int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
 522                     i_decimate_mb += i_decimate_8x8;
 523                     if( i_decimate_8x8 < 4 )
 524                     {
 525                         memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
 526                         memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
 527                         nnz8x8[idx] = 0;
 528                     }
 529                 }
 530             }
 531
 532             if( i_decimate_mb < 6 && !h->mb.b_trellis )
 533                 memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
 534             else
 535             {
 536                 const int stride = h->mb.pic.i_stride[0];
 537                 for( idx = 0; idx < 4; idx++ )
 538                     if( nnz8x8[idx] )
 539                     {
 540                         h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
 541                         h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*stride], stride, dct8x8[idx] );
 542                     }
 543             }
 544         }
 545         else
 546         {
 547             int16_t dct4x4[16][4][4];
 548             int nnz8x8[4] = {1,1,1,1};
 549             h->dctf.sub16x16_dct( dct4x4,
 550                                   h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 551                                   h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 552
 553             for( i8x8 = 0; i8x8 < 4; i8x8++ )
 554             {
 555                 int i_decimate_8x8;
 556
 557                 /* encode one 4x4 block */
 558                 i_decimate_8x8 = 0;
 559                 for( i4x4 = 0; i4x4 < 4; i4x4++ )
 560                 {
 561                     idx = i8x8 * 4 + i4x4;
 562
 563                     if( h->mb.b_noise_reduction )
 564                         x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
 565                     if( h->mb.b_trellis )
 566                         x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
 567                     else
 568                         quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
 569
 570                     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
 571
 572                     i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
 573                 }
 574
 575                 /* decimate this 8x8 block */
 576                 i_decimate_mb += i_decimate_8x8;
 577                 if( i_decimate_8x8 < 4 )
 578                 {
 579                     memset( &dct4x4[i8x8*4], 0, 4 * sizeof( *dct4x4 ) );
 580                     memset( &h->dct.block[i8x8*4], 0, 4 * sizeof( *h->dct.block ) );
 581                     nnz8x8[i8x8] = 0;
 582                 }
 583             }
 584
 585             if( i_decimate_mb < 6 )
 586                 memset( h->dct.block, 0, 16 * sizeof( *h->dct.block ) );
 587             else
 588             {
 589                 const int stride = h->mb.pic.i_stride[0];
 590                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
 591                     if( nnz8x8[i8x8] )
 592                     {
 593                         for( i = 0; i < 4; i++ )
 594                             h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
 595                         h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*stride], stride, &dct4x4[i8x8*4] );
 596                     }
 597             }
 598         }
 599     }
 600
 601     /* encode chroma */
 602     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 603     if( IS_INTRA( h->mb.i_type ) )
 604     {
 605         const int i_mode = h->mb.i_chroma_pred_mode;
 606         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
 607         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
 608     }
 609
 610     /* encode the 8x8 blocks */
 611     x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qp );
 612
 613     /* Calculate the Luma/Chroma patern and non_zero_count */
 614     h->mb.i_cbp_luma = 0x00;
 615     if( h->mb.i_type == I_16x16 )
 616     {
 617         for( i = 0; i < 16; i++ )
 618         {
 619             const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
 620             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 621             if( nz > 0 )
 622                 h->mb.i_cbp_luma = 0x0f;
 623         }
 624     }
 625     else if( h->mb.b_transform_8x8 )
 626     {
 627         /* coded_block_flag is enough for CABAC.
 628          * the full non_zero_count is done only in CAVLC. */
 629         for( i = 0; i < 4; i++ )
 630         {
 631             const int nz = array_non_zero( h->dct.luma8x8[i], 64 );
 632             int j;
 633             for( j = 0; j < 4; j++ )
 634                 h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
 635             if( nz > 0 )
 636                 h->mb.i_cbp_luma |= 1 << i;
 637         }
 638     }
 639     else
 640     {
 641         for( i = 0; i < 16; i++ )
 642         {
 643             const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
 644             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 645             if( nz > 0 )
 646                 h->mb.i_cbp_luma |= 1 << (i/4);
 647         }
 648     }
 649
 650     /* Calculate the chroma patern */
 651     h->mb.i_cbp_chroma = 0x00;
 652     for( i = 0; i < 8; i++ )
 653     {
 654         const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
 655         h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
 656         if( nz > 0 )
 657         {
 658             h->mb.i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
 659         }
 660     }
 661     if( h->mb.i_cbp_chroma == 0x00 && array_non_zero( h->dct.chroma_dc[0], 8 ) )
 662     {
 663         h->mb.i_cbp_chroma = 0x01;    /* dc only */
 664     }
 665
 666     if( h->param.b_cabac )
 667     {
 668         i_cbp_dc = ( h->mb.i_type == I_16x16 && array_non_zero( h->dct.luma16x16_dc, 16 ) )
 669                  | array_non_zero( h->dct.chroma_dc[0], 4 ) << 1
 670                  | array_non_zero( h->dct.chroma_dc[1], 4 ) << 2;
 671     }
 672
 673     /* store cbp */
 674     h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
 675
 676     /* Check for P_SKIP
 677      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
 678      *      (if multiple mv give same result)*/
 679     if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
 680         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
 681         h->mb.cache.ref[0][x264_scan8[0]] == 0 )
 682     {
 683         int mvp[2];
 684
 685         x264_mb_predict_mv_pskip( h, mvp );
 686         if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
 687             h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
 688         {
 689             h->mb.i_type = P_SKIP;
 690         }
 691     }
 692
 693     /* Check for B_SKIP */
 694     if( h->mb.i_type == B_DIRECT &&
 695         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
 696     {
 697         h->mb.i_type = B_SKIP;
 698     }
 699 }
 700
 701 /*****************************************************************************
 702  * x264_macroblock_probe_skip:
 703  *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
 704  *  the previous QP
 705  *****************************************************************************/
 706 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 707 {
 708     DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
 709     DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
 710     DECLARE_ALIGNED( int,     dctscan[16], 16 );
 711
 712     int i_qp = h->mb.i_qp;
 713     int mvp[2];
 714     int ch;
 715
 716     int i8x8, i4x4;
 717     int i_decimate_mb;
 718
 719     if( !b_bidir )
 720     {
 721         /* Get the MV */
 722         x264_mb_predict_mv_pskip( h, mvp );
 723         mvp[0] = x264_clip3( mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
 724         mvp[1] = x264_clip3( mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
 725
 726         /* Motion compensation */
 727         h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 728                         h->mb.pic.p_fdec[0],   h->mb.pic.i_stride[0],
 729                         mvp[0], mvp[1], 16, 16 );
 730     }
 731
 732     /* get luma diff */
 733     h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 734                                   h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 735
 736     for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
 737     {
 738         /* encode one 4x4 block */
 739         for( i4x4 = 0; i4x4 < 4; i4x4++ )
 740         {
 741             const int idx = i8x8 * 4 + i4x4;
 742
 743             quant_4x4( h, dct4x4[idx], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 744             scan_zigzag_4x4full( dctscan, dct4x4[idx] );
 745
 746             i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
 747
 748             if( i_decimate_mb >= 6 )
 749             {
 750                 /* not as P_SKIP */
 751                 return 0;
 752             }
 753         }
 754     }
 755
 756     /* encode chroma */
 757     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 758
 759     for( ch = 0; ch < 2; ch++ )
 760     {
 761         const int i_stride = h->mb.pic.i_stride[1+ch];
 762         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 763         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 764
 765         if( !b_bidir )
 766         {
 767             h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4+ch], i_stride,
 768                               h->mb.pic.p_fdec[1+ch],       i_stride,
 769                               mvp[0], mvp[1], 8, 8 );
 770         }
 771
 772         h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 773
 774         /* calculate dct DC */
 775         dct2x2[0][0] = dct4x4[0][0][0];
 776         dct2x2[0][1] = dct4x4[1][0][0];
 777         dct2x2[1][0] = dct4x4[2][0][0];
 778         dct2x2[1][1] = dct4x4[3][0][0];
 779         h->dctf.dct2x2dc( dct2x2 );
 780         quant_2x2_dc( h, dct2x2, (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 781         if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
 782         {
 783             /* can't be */
 784             return 0;
 785         }
 786
 787         /* calculate dct coeffs */
 788         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
 789         {
 790             quant_4x4( h, dct4x4[i4x4], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 791             scan_zigzag_4x4( dctscan, dct4x4[i4x4] );
 792
 793             i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
 794             if( i_decimate_mb >= 7 )
 795             {
 796                 return 0;
 797             }
 798         }
 799     }
 800
 801     return 1;
 802 }
 803
 804 /****************************************************************************
 805  * DCT-domain noise reduction / adaptive deadzone
 806  * from libavcodec
 807  ****************************************************************************/
 808
 809 void x264_noise_reduction_update( x264_t *h )
 810 {
 811     int cat, i;
 812     for( cat = 0; cat < 4; cat++ )
 813     {
 814         int b_8x8 = cat >= 2;
 815         int size = b_8x8 ? 64 : 16;
 816         const int *weight = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
 817
 818         if( h->nr_count[cat] > (b_8x8 ? (1<<16) : (1<<18)) )
 819         {
 820             for( i = 0; i < size; i++ )
 821                 h->nr_residual_sum[cat][i] >>= 1;
 822             h->nr_count[cat] >>= 1;
 823         }
 824
 825         for( i = 0; i < size; i++ )
 826             h->nr_offset[cat][i] =
 827                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
 828                  + h->nr_residual_sum[cat][i]/2)
 829               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
 830     }
 831 }
 832
 833 void x264_denoise_dct( x264_t *h, int16_t *dct )
 834 {
 835     const int cat = !IS_INTRA(h->mb.i_type) + 2*h->mb.b_transform_8x8;
 836     int i;
 837
 838     h->nr_count[cat]++;
 839
 840     for( i = (cat >= 2 ? 63 : 15); i >= 1; i-- )
 841     {
 842         int level = dct[i];
 843         if( level )
 844         {
 845             if( level > 0 )
 846             {
 847                 h->nr_residual_sum[cat][i] += level;
 848                 level -= h->nr_offset[cat][i];
 849                 if( level < 0 )
 850                     level = 0;
 851             }
 852             else
 853             {
 854                 h->nr_residual_sum[cat][i] -= level;
 855                 level += h->nr_offset[cat][i];
 856                 if( level > 0 )
 857                     level = 0;
 858             }
 859             dct[i] = level;
 860         }
 861     }
 862 }