git.sesse.net Git - x264/blob - encoder/macroblock.c

   1 /*****************************************************************************
   2  * macroblock.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #include "common/common.h"
  29 #include "macroblock.h"
  30
  31
  32 /* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
  33 /* FIXME this seems to make better decisions with cqm=jvt, but could screw up
  34  * with general custom matrices. */
  35 static const int def_quant4_mf[6][4][4] =
  36 {
  37     { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
  38       { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
  39     { { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 },
  40       { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 } },
  41     { { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 },
  42       { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 } },
  43     { {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 },
  44       {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 } },
  45     { {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 },
  46       {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 } },
  47     { {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 },
  48       {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 } }
  49 };
  50
  51 /****************************************************************************
  52  * Scan and Quant functions
  53  ****************************************************************************/
  54 //static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
  55 //static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
  56
  57 #define ZIG(i,y,x) level[i] = dct[y][x];
  58 static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] )
  59 {
  60     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
  61     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
  62     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
  63     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
  64     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
  65     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
  66     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
  67     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
  68     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
  69     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
  70     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
  71     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
  72     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
  73     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
  74     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
  75     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
  76 }
  77 static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
  78 {
  79     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
  80     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
  81     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
  82     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
  83 }
  84 static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
  85 {
  86                 ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
  87     ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
  88     ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
  89     ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
  90 }
  91 static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
  92 {
  93     ZIG(0,0,0)
  94     ZIG(1,0,1)
  95     ZIG(2,1,0)
  96     ZIG(3,1,1)
  97 }
  98 #undef ZIG
  99
 100 #define ZIG(i,y,x) {\
 101     int o = x+y*i_stride;\
 102     level[i] = p_src[o] - p_dst[o];\
 103     p_dst[o] = p_src[o];\
 104 }
 105 static inline void sub_zigzag_4x4full( int level[16], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
 106 {
 107     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
 108     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
 109     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
 110     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 111 }
 112 static inline void sub_zigzag_4x4( int level[15], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
 113 {
 114                 ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
 115     ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
 116     ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
 117     ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
 118 }
 119 #undef ZIG
 120
 121 static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra )
 122 {
 123     const int i_qbits = 16 + i_qscale / 6;
 124     const int i_mf = i_qscale % 6;
 125     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 126     h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f );
 127 }
 128 static void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
 129 {
 130     const int i_qbits = 15 + i_qscale / 6;
 131     const int i_mf = i_qscale % 6;
 132     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 133     h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
 134 }
 135 static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale )
 136 {
 137     const int i_qbits = 16 + i_qscale / 6;
 138     const int i_mf = i_qscale % 6;
 139     const int f = ( 1 << i_qbits ) / 3;
 140     h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 141 }
 142 static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra )
 143 {
 144     const int i_qbits = 16 + i_qscale / 6;
 145     const int i_mf = i_qscale % 6;
 146     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 147     h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 148 }
 149
 150 /* (ref: JVT-B118)
 151  * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
 152  * to 0 (low score means set it to null)
 153  * Used in inter macroblock (luma and chroma)
 154  *  luma: for a 8x8 block: if score < 4 -> null
 155  *        for the complete mb: if score < 6 -> null
 156  *  chroma: for the complete mb: if score < 7 -> null
 157  */
 158 static int x264_mb_decimate_score( int *dct, int i_max )
 159 {
 160     static const int i_ds_table4[16] = {
 161         3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
 162     static const int i_ds_table8[64] = {
 163         3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
 164         1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
 165         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 166         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
 167
 168     const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4;
 169     int i_score = 0;
 170     int idx = i_max - 1;
 171
 172     while( idx >= 0 && dct[idx] == 0 )
 173         idx--;
 174
 175     while( idx >= 0 )
 176     {
 177         int i_run;
 178
 179         if( abs( dct[idx--] ) > 1 )
 180             return 9;
 181
 182         i_run = 0;
 183         while( idx >= 0 && dct[idx] == 0 )
 184         {
 185             idx--;
 186             i_run++;
 187         }
 188         i_score += ds_table[i_run];
 189     }
 190
 191     return i_score;
 192 }
 193
 194 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
 195 {
 196     const int i_stride = h->mb.pic.i_stride[0];
 197     const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
 198     uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
 199     uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
 200     int16_t dct4x4[4][4];
 201
 202     if( h->mb.b_lossless )
 203     {
 204         sub_zigzag_4x4full( h->dct.block[idx].luma4x4, p_src, p_dst, i_stride );
 205         return;
 206     }
 207
 208     h->dctf.sub4x4_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 209
 210     if( h->mb.b_trellis )
 211         x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
 212     else
 213         quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
 214
 215     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
 216     x264_mb_dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
 217
 218     /* output samples to fdec */
 219     h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 );
 220 }
 221
 222 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
 223 {
 224     const int i_stride = h->mb.pic.i_stride[0];
 225     const int i_offset = 8 * (idx&1) + 8 * (idx>>1) * i_stride;
 226     uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
 227     uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
 228     int16_t dct8x8[8][8];
 229
 230     h->dctf.sub8x8_dct8( dct8x8, p_src, i_stride, p_dst, i_stride );
 231
 232     if( h->mb.b_trellis )
 233         x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
 234     else
 235         quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
 236
 237     scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
 238     x264_mb_dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
 239     h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 );
 240 }
 241
 242 static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
 243 {
 244     const int i_stride = h->mb.pic.i_stride[0];
 245     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 246     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 247
 248     int16_t dct4x4[16+1][4][4];
 249
 250     int i;
 251
 252     if( h->mb.b_lossless )
 253     {
 254         for( i = 0; i < 16; i++ )
 255         {
 256             int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
 257             sub_zigzag_4x4( h->dct.block[i].residual_ac, p_src+o, p_dst+o, i_stride );
 258             dct4x4[0][block_idx_y[i]][block_idx_x[i]] = p_src[o] - p_dst[o];
 259             p_dst[o] = p_src[o];
 260         }
 261         scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 262         return;
 263     }
 264
 265     h->dctf.sub16x16_dct( &dct4x4[1], p_src, i_stride, p_dst, i_stride );
 266     for( i = 0; i < 16; i++ )
 267     {
 268         /* copy dc coeff */
 269         dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
 270
 271         /* quant/scan/dequant */
 272         if( h->mb.b_trellis )
 273             x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
 274         else
 275             quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
 276
 277         scan_zigzag_4x4( h->dct.block[i].residual_ac, dct4x4[1+i] );
 278         x264_mb_dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
 279     }
 280
 281     h->dctf.dct4x4dc( dct4x4[0] );
 282     quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale );
 283     scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 284
 285     /* output samples to fdec */
 286     h->dctf.idct4x4dc( dct4x4[0] );
 287     x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
 288
 289     /* calculate dct coeffs */
 290     for( i = 0; i < 16; i++ )
 291     {
 292         /* copy dc coeff */
 293         dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
 294     }
 295     /* put pixels to fdec */
 296     h->dctf.add16x16_idct( p_dst, i_stride, &dct4x4[1] );
 297 }
 298
 299 static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
 300 {
 301     int i, ch;
 302
 303     for( ch = 0; ch < 2; ch++ )
 304     {
 305         const int i_stride = h->mb.pic.i_stride[1+ch];
 306         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 307         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 308         int i_decimate_score = 0;
 309
 310         int16_t dct2x2[2][2];
 311         int16_t dct4x4[4][4][4];
 312
 313         if( h->mb.b_lossless )
 314         {
 315             for( i = 0; i < 4; i++ )
 316             {
 317                 int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
 318                 sub_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, p_src+o, p_dst+o, i_stride );
 319                 h->dct.chroma_dc[ch][i] = p_src[o] - p_dst[o];
 320                 p_dst[o] = p_src[o];
 321             }
 322             continue;
 323         }
 324
 325         h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 326         /* calculate dct coeffs */
 327         for( i = 0; i < 4; i++ )
 328         {
 329             /* copy dc coeff */
 330             dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
 331
 332             /* no trellis; it doesn't seem to help chroma noticeably */
 333             quant_4x4( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 334             scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
 335             x264_mb_dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
 336
 337             if( b_inter )
 338             {
 339                 i_decimate_score += x264_mb_decimate_score( h->dct.block[16+i+ch*4].residual_ac, 15 );
 340             }
 341         }
 342
 343         h->dctf.dct2x2dc( dct2x2 );
 344         quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 345         scan_zigzag_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
 346
 347         /* output samples to fdec */
 348         h->dctf.idct2x2dc( dct2x2 );
 349         x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale );  /* XXX not inversed */
 350
 351         if( b_inter && i_decimate_score < 7 )
 352         {
 353             /* Near null chroma 8x8 block so make it null (bits saving) */
 354             memset( dct4x4, 0, sizeof( dct4x4 ) );
 355             memset( &h->dct.block[16+ch*4], 0, 4 * sizeof( *h->dct.block ) );
 356         }
 357
 358         /* calculate dct coeffs */
 359         for( i = 0; i < 4; i++ )
 360         {
 361             /* copy dc coeff */
 362             dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
 363         }
 364         h->dctf.add8x8_idct( p_dst, i_stride, dct4x4 );
 365     }
 366 }
 367
 368 static void x264_macroblock_encode_skip( x264_t *h )
 369 {
 370     int i;
 371     h->mb.i_cbp_luma = 0x00;
 372     h->mb.i_cbp_chroma = 0x00;
 373
 374     for( i = 0; i < 16+8; i++ )
 375     {
 376         h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
 377     }
 378
 379     /* store cbp */
 380     h->mb.cbp[h->mb.i_mb_xy] = 0;
 381 }
 382
 383 /*****************************************************************************
 384  * x264_macroblock_encode_pskip:
 385  *  Encode an already marked skip block
 386  *****************************************************************************/
 387 void x264_macroblock_encode_pskip( x264_t *h )
 388 {
 389     const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
 390                                 h->mb.mv_min[0], h->mb.mv_max[0] );
 391     const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
 392                                 h->mb.mv_min[1], h->mb.mv_max[1] );
 393
 394     /* Motion compensation XXX probably unneeded */
 395     h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 396                     h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
 397                     mvx, mvy, 16, 16 );
 398
 399     /* Chroma MC */
 400     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
 401                       h->mb.pic.p_fdec[1],       h->mb.pic.i_stride[1],
 402                       mvx, mvy, 8, 8 );
 403
 404     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
 405                       h->mb.pic.p_fdec[2],       h->mb.pic.i_stride[2],
 406                       mvx, mvy, 8, 8 );
 407
 408     x264_macroblock_encode_skip( h );
 409 }
 410
 411 /*****************************************************************************
 412  * x264_macroblock_encode:
 413  *****************************************************************************/
 414 void x264_macroblock_encode( x264_t *h )
 415 {
 416     int i_cbp_dc = 0;
 417     int i_qp = h->mb.i_qp;
 418     int i;
 419
 420     if( h->mb.i_type == P_SKIP )
 421     {
 422         /* A bit special */
 423         x264_macroblock_encode_pskip( h );
 424         return;
 425     }
 426     if( h->mb.i_type == B_SKIP )
 427     {
 428         /* XXX motion compensation is probably unneeded */
 429         x264_mb_mc( h );
 430         x264_macroblock_encode_skip( h );
 431         return;
 432     }
 433
 434     if( h->mb.i_type == I_16x16 )
 435     {
 436         const int i_mode = h->mb.i_intra16x16_pred_mode;
 437         h->mb.b_transform_8x8 = 0;
 438         /* do the right prediction */
 439         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 440
 441         /* encode the 16x16 macroblock */
 442         x264_mb_encode_i16x16( h, i_qp );
 443     }
 444     else if( h->mb.i_type == I_8x8 )
 445     {
 446         h->mb.b_transform_8x8 = 1;
 447         for( i = 0; i < 4; i++ )
 448         {
 449             const int i_dst = h->mb.pic.i_stride[0];
 450             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * i_dst];
 451             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
 452
 453             h->predict_8x8[i_mode]( p_dst, i_dst, h->mb.i_neighbour8[i] );
 454             x264_mb_encode_i8x8( h, i, i_qp );
 455         }
 456     }
 457     else if( h->mb.i_type == I_4x4 )
 458     {
 459         h->mb.b_transform_8x8 = 0;
 460         for( i = 0; i < 16; i++ )
 461         {
 462             const int i_dst = h->mb.pic.i_stride[0];
 463             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst];
 464             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 465
 466             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 467                 /* emulate missing topright samples */
 468                 *(uint32_t*) &p_dst[4 - i_dst] = p_dst[3 - i_dst] * 0x01010101U;
 469
 470             h->predict_4x4[i_mode]( p_dst, i_dst );
 471             x264_mb_encode_i4x4( h, i, i_qp );
 472         }
 473     }
 474     else    /* Inter MB */
 475     {
 476         int i8x8, i4x4, idx;
 477         int i_decimate_mb = 0;
 478
 479         /* Motion compensation */
 480         x264_mb_mc( h );
 481
 482         if( h->mb.b_lossless )
 483         {
 484             for( i4x4 = 0; i4x4 < 16; i4x4++ )
 485             {
 486                 int o = block_idx_x[i4x4]*4 + block_idx_y[i4x4]*4 * h->mb.pic.i_stride[0];
 487                 sub_zigzag_4x4full( h->dct.block[i4x4].luma4x4, h->mb.pic.p_fenc[0]+o, h->mb.pic.p_fdec[0]+o, h->mb.pic.i_stride[0] );
 488             }
 489         }
 490         else if( h->mb.b_transform_8x8 )
 491         {
 492             int16_t dct8x8[4][8][8];
 493             h->dctf.sub16x16_dct8( dct8x8,
 494                                    h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 495                                    h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 496
 497             for( idx = 0; idx < 4; idx++ )
 498             {
 499                 if( h->mb.b_trellis )
 500                     x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
 501                 else
 502                     quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
 503
 504                 scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
 505                 x264_mb_dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
 506
 507                 if( !h->mb.b_trellis )
 508                 {
 509                     int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
 510                     i_decimate_mb += i_decimate_8x8;
 511                     if( i_decimate_8x8 < 4 )
 512                     {
 513                         memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
 514                         memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
 515                     }
 516                 }
 517             }
 518
 519             if( i_decimate_mb < 6 && !h->mb.b_trellis )
 520                 memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
 521             else
 522                 h->dctf.add16x16_idct8( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct8x8 );
 523         }
 524         else
 525         {
 526             int16_t dct4x4[16][4][4];
 527             h->dctf.sub16x16_dct( dct4x4,
 528                                   h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 529                                   h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 530
 531             for( i8x8 = 0; i8x8 < 4; i8x8++ )
 532             {
 533                 int i_decimate_8x8;
 534
 535                 /* encode one 4x4 block */
 536                 i_decimate_8x8 = 0;
 537                 for( i4x4 = 0; i4x4 < 4; i4x4++ )
 538                 {
 539                     idx = i8x8 * 4 + i4x4;
 540
 541                     if( h->mb.b_trellis )
 542                         x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
 543                     else
 544                         quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
 545
 546                     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
 547                     x264_mb_dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
 548
 549                     i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
 550                 }
 551
 552                 /* decimate this 8x8 block */
 553                 i_decimate_mb += i_decimate_8x8;
 554                 if( i_decimate_8x8 < 4 )
 555                 {
 556                     memset( &dct4x4[i8x8*4], 0, 4 * sizeof( *dct4x4 ) );
 557                     memset( &h->dct.block[i8x8*4], 0, 4 * sizeof( *h->dct.block ) );
 558                 }
 559             }
 560
 561             if( i_decimate_mb < 6 )
 562                 memset( h->dct.block, 0, 16 * sizeof( *h->dct.block ) );
 563             else
 564                 h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 );
 565         }
 566     }
 567
 568     /* encode chroma */
 569     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 570     if( IS_INTRA( h->mb.i_type ) )
 571     {
 572         const int i_mode = h->mb.i_chroma_pred_mode;
 573         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
 574         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
 575     }
 576
 577     /* encode the 8x8 blocks */
 578     x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qp );
 579
 580     /* Calculate the Luma/Chroma patern and non_zero_count */
 581     h->mb.i_cbp_luma = 0x00;
 582     if( h->mb.i_type == I_16x16 )
 583     {
 584         for( i = 0; i < 16; i++ )
 585         {
 586             const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
 587             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 588             if( nz > 0 )
 589                 h->mb.i_cbp_luma = 0x0f;
 590         }
 591     }
 592     else if( h->mb.b_transform_8x8 )
 593     {
 594         /* coded_block_flag is enough for CABAC.
 595          * the full non_zero_count is done only in CAVLC. */
 596         for( i = 0; i < 4; i++ )
 597         {
 598             const int nz = array_non_zero( h->dct.luma8x8[i], 64 );
 599             int j;
 600             for( j = 0; j < 4; j++ )
 601                 h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
 602             if( nz > 0 )
 603                 h->mb.i_cbp_luma |= 1 << i;
 604         }
 605     }
 606     else
 607     {
 608         for( i = 0; i < 16; i++ )
 609         {
 610             const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
 611             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 612             if( nz > 0 )
 613                 h->mb.i_cbp_luma |= 1 << (i/4);
 614         }
 615     }
 616
 617     /* Calculate the chroma patern */
 618     h->mb.i_cbp_chroma = 0x00;
 619     for( i = 0; i < 8; i++ )
 620     {
 621         const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
 622         h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
 623         if( nz > 0 )
 624         {
 625             h->mb.i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
 626         }
 627     }
 628     if( h->mb.i_cbp_chroma == 0x00 &&
 629         ( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 || array_non_zero_count( h->dct.chroma_dc[1], 4 ) ) > 0 )
 630     {
 631         h->mb.i_cbp_chroma = 0x01;    /* dc only */
 632     }
 633
 634     if( h->param.b_cabac )
 635     {
 636         if( h->mb.i_type == I_16x16 && array_non_zero_count( h->dct.luma16x16_dc, 16 ) > 0 )
 637             i_cbp_dc = 0x01;
 638         else
 639             i_cbp_dc = 0x00;
 640
 641         if( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 )
 642             i_cbp_dc |= 0x02;
 643         if( array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 )
 644             i_cbp_dc |= 0x04;
 645     }
 646
 647     /* store cbp */
 648     h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
 649
 650     /* Check for P_SKIP
 651      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
 652      *      (if multiple mv give same result)*/
 653     if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
 654         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
 655         h->mb.cache.ref[0][x264_scan8[0]] == 0 )
 656     {
 657         int mvp[2];
 658
 659         x264_mb_predict_mv_pskip( h, mvp );
 660         if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
 661             h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
 662         {
 663             h->mb.i_type = P_SKIP;
 664         }
 665     }
 666
 667     /* Check for B_SKIP */
 668     if( h->mb.i_type == B_DIRECT &&
 669         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
 670     {
 671         h->mb.i_type = B_SKIP;
 672     }
 673 }
 674
 675 /*****************************************************************************
 676  * x264_macroblock_probe_skip:
 677  *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
 678  *  the previous QP
 679  *****************************************************************************/
 680 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 681 {
 682     DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
 683     DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
 684     DECLARE_ALIGNED( int,     dctscan[16], 16 );
 685
 686     int i_qp = h->mb.i_qp;
 687     int mvp[2];
 688     int ch;
 689
 690     int i8x8, i4x4;
 691     int i_decimate_mb;
 692
 693     if( !b_bidir )
 694     {
 695         /* Get the MV */
 696         x264_mb_predict_mv_pskip( h, mvp );
 697         mvp[0] = x264_clip3( mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
 698         mvp[1] = x264_clip3( mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
 699
 700         /* Motion compensation */
 701         h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 702                         h->mb.pic.p_fdec[0],   h->mb.pic.i_stride[0],
 703                         mvp[0], mvp[1], 16, 16 );
 704     }
 705
 706     /* get luma diff */
 707     h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 708                                   h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 709
 710     for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
 711     {
 712         /* encode one 4x4 block */
 713         for( i4x4 = 0; i4x4 < 4; i4x4++ )
 714         {
 715             const int idx = i8x8 * 4 + i4x4;
 716
 717             quant_4x4( h, dct4x4[idx], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 718             scan_zigzag_4x4full( dctscan, dct4x4[idx] );
 719
 720             i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
 721
 722             if( i_decimate_mb >= 6 )
 723             {
 724                 /* not as P_SKIP */
 725                 return 0;
 726             }
 727         }
 728     }
 729
 730     /* encode chroma */
 731     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 732
 733     for( ch = 0; ch < 2; ch++ )
 734     {
 735         const int i_stride = h->mb.pic.i_stride[1+ch];
 736         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 737         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 738
 739         if( !b_bidir )
 740         {
 741             h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4+ch], i_stride,
 742                               h->mb.pic.p_fdec[1+ch],       i_stride,
 743                               mvp[0], mvp[1], 8, 8 );
 744         }
 745
 746         h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 747
 748         /* calculate dct DC */
 749         dct2x2[0][0] = dct4x4[0][0][0];
 750         dct2x2[0][1] = dct4x4[1][0][0];
 751         dct2x2[1][0] = dct4x4[2][0][0];
 752         dct2x2[1][1] = dct4x4[3][0][0];
 753         h->dctf.dct2x2dc( dct2x2 );
 754         quant_2x2_dc( h, dct2x2, (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 755         if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
 756         {
 757             /* can't be */
 758             return 0;
 759         }
 760
 761         /* calculate dct coeffs */
 762         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
 763         {
 764             quant_4x4( h, dct4x4[i4x4], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 765             scan_zigzag_4x4( dctscan, dct4x4[i4x4] );
 766
 767             i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
 768             if( i_decimate_mb >= 7 )
 769             {
 770                 return 0;
 771             }
 772         }
 773     }
 774
 775     return 1;
 776 }