git.sesse.net Git - x264/blob - encoder/macroblock.c

   1 /*****************************************************************************
   2  * macroblock.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include <stdlib.h>
  25 #include <stdio.h>
  26 #include <string.h>
  27
  28 #include "common/common.h"
  29 #include "macroblock.h"
  30
  31
/* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
/* FIXME this seems to make better decisions with cqm=jvt, but could screw up
 * with general custom matrices. */
/* Layout is [i_qscale % 6][row][col]: quant_4x4() and quant_2x2_dc() pick the
 * first index with i_qscale % 6, and the DC quantizer reads only entry [0][0]
 * of the selected matrix. */
static const int def_quant4_mf[6][4][4] =
{
    { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
      { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
    { { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 },
      { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 } },
    { { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 },
      { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 } },
    { {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 },
      {  9362, 5825,  9362, 5825 }, { 5825, 3647, 5825, 3647 } },
    { {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 },
      {  8192, 5243,  8192, 5243 }, { 5243, 3355, 5243, 3355 } },
    { {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 },
      {  7282, 4559,  7282, 4559 }, { 4559, 2893, 4559, 2893 } }
};
  50
  51 /****************************************************************************
  52  * Scan and Quant functions
  53  ****************************************************************************/
  54 //static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3};
  55 //static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3};
  56
  57 #define ZIG(i,y,x) level[i] = dct[y][x];
  58 static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] )
  59 {
  60     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
  61     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
  62     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
  63     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
  64     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
  65     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
  66     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
  67     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
  68     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
  69     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
  70     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
  71     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
  72     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
  73     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
  74     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
  75     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
  76 }
  77 static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
  78 {
  79     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
  80     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
  81     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
  82     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
  83 }
  84 static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
  85 {
  86                 ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
  87     ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
  88     ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
  89     ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
  90 }
  91 static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
  92 {
  93     ZIG(0,0,0)
  94     ZIG(1,0,1)
  95     ZIG(2,1,0)
  96     ZIG(3,1,1)
  97 }
  98 #undef ZIG
  99
 100 #define ZIG(i,y,x) {\
 101     int o = x+y*i_stride;\
 102     level[i] = p_src[o] - p_dst[o];\
 103     p_dst[o] = p_src[o];\
 104 }
 105 static inline void sub_zigzag_4x4full( int level[16], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
 106 {
 107     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
 108     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
 109     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
 110     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 111 }
 112 static inline void sub_zigzag_4x4( int level[15], const uint8_t *p_src, uint8_t *p_dst, int i_stride )
 113 {
 114                 ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
 115     ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
 116     ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
 117     ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
 118 }
 119 #undef ZIG
 120
 121 static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra )
 122 {
 123     const int i_qbits = 16 + i_qscale / 6;
 124     const int i_mf = i_qscale % 6;
 125     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 126     h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f );
 127 }
 128 static void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
 129 {
 130     const int i_qbits = 15 + i_qscale / 6;
 131     const int i_mf = i_qscale % 6;
 132     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 133     h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
 134 }
 135 static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale )
 136 {
 137     const int i_qbits = 16 + i_qscale / 6;
 138     const int i_mf = i_qscale % 6;
 139     const int f = ( 1 << i_qbits ) / 3;
 140     h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 141 }
 142 static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra )
 143 {
 144     const int i_qbits = 16 + i_qscale / 6;
 145     const int i_mf = i_qscale % 6;
 146     const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
 147     h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
 148 }
 149
 150 /* (ref: JVT-B118)
 151  * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
 152  * to 0 (low score means set it to null)
 153  * Used in inter macroblock (luma and chroma)
 154  *  luma: for a 8x8 block: if score < 4 -> null
 155  *        for the complete mb: if score < 6 -> null
 156  *  chroma: for the complete mb: if score < 7 -> null
 157  */
 158 static int x264_mb_decimate_score( int *dct, int i_max )
 159 {
 160     static const int i_ds_table4[16] = {
 161         3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
 162     static const int i_ds_table8[64] = {
 163         3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
 164         1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
 165         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 166         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
 167
 168     const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4;
 169     int i_score = 0;
 170     int idx = i_max - 1;
 171
 172     while( idx >= 0 && dct[idx] == 0 )
 173         idx--;
 174
 175     while( idx >= 0 )
 176     {
 177         int i_run;
 178
 179         if( abs( dct[idx--] ) > 1 )
 180             return 9;
 181
 182         i_run = 0;
 183         while( idx >= 0 && dct[idx] == 0 )
 184         {
 185             idx--;
 186             i_run++;
 187         }
 188         i_score += ds_table[i_run];
 189     }
 190
 191     return i_score;
 192 }
 193
 194 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
 195 {
 196     const int i_stride = h->mb.pic.i_stride[0];
 197     const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride;
 198     uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
 199     uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
 200     int16_t dct4x4[4][4];
 201
 202     if( h->mb.b_lossless )
 203     {
 204         sub_zigzag_4x4full( h->dct.block[idx].luma4x4, p_src, p_dst, i_stride );
 205         return;
 206     }
 207
 208     h->dctf.sub4x4_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 209     quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
 210     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
 211     x264_mb_dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
 212
 213     /* output samples to fdec */
 214     h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 );
 215 }
 216
 217 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
 218 {
 219     const int i_stride = h->mb.pic.i_stride[0];
 220     const int i_offset = 8 * (idx&1) + 8 * (idx>>1) * i_stride;
 221     uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset];
 222     uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset];
 223     int16_t dct8x8[8][8];
 224
 225     h->dctf.sub8x8_dct8( dct8x8, p_src, i_stride, p_dst, i_stride );
 226     quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
 227     scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
 228     x264_mb_dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
 229     h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 );
 230 }
 231
 232 static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
 233 {
 234     const int i_stride = h->mb.pic.i_stride[0];
 235     uint8_t  *p_src = h->mb.pic.p_fenc[0];
 236     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 237
 238     int16_t dct4x4[16+1][4][4];
 239
 240     int i;
 241
 242     if( h->mb.b_lossless )
 243     {
 244         for( i = 0; i < 16; i++ )
 245         {
 246             int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
 247             sub_zigzag_4x4( h->dct.block[i].residual_ac, p_src+o, p_dst+o, i_stride );
 248             dct4x4[0][block_idx_y[i]][block_idx_x[i]] = p_src[o] - p_dst[o];
 249             p_dst[o] = p_src[o];
 250         }
 251         scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 252         return;
 253     }
 254
 255     h->dctf.sub16x16_dct( &dct4x4[1], p_src, i_stride, p_dst, i_stride );
 256     for( i = 0; i < 16; i++ )
 257     {
 258         /* copy dc coeff */
 259         dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
 260
 261         /* quant/scan/dequant */
 262         quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
 263         scan_zigzag_4x4( h->dct.block[i].residual_ac, dct4x4[1+i] );
 264         x264_mb_dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
 265     }
 266
 267     h->dctf.dct4x4dc( dct4x4[0] );
 268     quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale );
 269     scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
 270
 271     /* output samples to fdec */
 272     h->dctf.idct4x4dc( dct4x4[0] );
 273     x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
 274
 275     /* calculate dct coeffs */
 276     for( i = 0; i < 16; i++ )
 277     {
 278         /* copy dc coeff */
 279         dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
 280     }
 281     /* put pixels to fdec */
 282     h->dctf.add16x16_idct( p_dst, i_stride, &dct4x4[1] );
 283 }
 284
 285 static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
 286 {
 287     int i, ch;
 288
 289     for( ch = 0; ch < 2; ch++ )
 290     {
 291         const int i_stride = h->mb.pic.i_stride[1+ch];
 292         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 293         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 294         int i_decimate_score = 0;
 295
 296         int16_t dct2x2[2][2];
 297         int16_t dct4x4[4][4][4];
 298
 299         if( h->mb.b_lossless )
 300         {
 301             for( i = 0; i < 4; i++ )
 302             {
 303                 int o = block_idx_x[i]*4 + block_idx_y[i]*4*i_stride;
 304                 sub_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, p_src+o, p_dst+o, i_stride );
 305                 h->dct.chroma_dc[ch][i] = p_src[o] - p_dst[o];
 306                 p_dst[o] = p_src[o];
 307             }
 308             continue;
 309         }
 310
 311         h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 312         /* calculate dct coeffs */
 313         for( i = 0; i < 4; i++ )
 314         {
 315             /* copy dc coeff */
 316             dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
 317
 318             quant_4x4( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 319             scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
 320             x264_mb_dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
 321
 322             if( b_inter )
 323             {
 324                 i_decimate_score += x264_mb_decimate_score( h->dct.block[16+i+ch*4].residual_ac, 15 );
 325             }
 326         }
 327
 328         h->dctf.dct2x2dc( dct2x2 );
 329         quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
 330         scan_zigzag_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
 331
 332         /* output samples to fdec */
 333         h->dctf.idct2x2dc( dct2x2 );
 334         x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale );  /* XXX not inversed */
 335
 336         if( b_inter && i_decimate_score < 7 )
 337         {
 338             /* Near null chroma 8x8 block so make it null (bits saving) */
 339             for( i = 0; i < 4; i++ )
 340             {
 341                 int x, y;
 342                 for( x = 0; x < 15; x++ )
 343                 {
 344                     h->dct.block[16+i+ch*4].residual_ac[x] = 0;
 345                 }
 346                 for( x = 0; x < 4; x++ )
 347                 {
 348                     for( y = 0; y < 4; y++ )
 349                     {
 350                         dct4x4[i][x][y] = 0;
 351                     }
 352                 }
 353             }
 354         }
 355
 356         /* calculate dct coeffs */
 357         for( i = 0; i < 4; i++ )
 358         {
 359             /* copy dc coeff */
 360             dct4x4[i][0][0] = dct2x2[block_idx_y[i]][block_idx_x[i]];
 361         }
 362         h->dctf.add8x8_idct( p_dst, i_stride, dct4x4 );
 363     }
 364 }
 365
 366 static void x264_macroblock_encode_skip( x264_t *h )
 367 {
 368     int i;
 369     h->mb.i_cbp_luma = 0x00;
 370     h->mb.i_cbp_chroma = 0x00;
 371
 372     for( i = 0; i < 16+8; i++ )
 373     {
 374         h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
 375     }
 376
 377     /* store cbp */
 378     h->mb.cbp[h->mb.i_mb_xy] = 0;
 379 }
 380
 381 /*****************************************************************************
 382  * x264_macroblock_encode_pskip:
 383  *  Encode an already marked skip block
 384  *****************************************************************************/
 385 void x264_macroblock_encode_pskip( x264_t *h )
 386 {
 387     const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
 388                                 h->mb.mv_min[0], h->mb.mv_max[0] );
 389     const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
 390                                 h->mb.mv_min[1], h->mb.mv_max[1] );
 391
 392     /* Motion compensation XXX probably unneeded */
 393     h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 394                     h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
 395                     mvx, mvy, 16, 16 );
 396
 397     /* Chroma MC */
 398     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
 399                       h->mb.pic.p_fdec[1],       h->mb.pic.i_stride[1],
 400                       mvx, mvy, 8, 8 );
 401
 402     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
 403                       h->mb.pic.p_fdec[2],       h->mb.pic.i_stride[2],
 404                       mvx, mvy, 8, 8 );
 405
 406     x264_macroblock_encode_skip( h );
 407 }
 408
 409 /*****************************************************************************
 410  * x264_macroblock_encode:
 411  *****************************************************************************/
 412 void x264_macroblock_encode( x264_t *h )
 413 {
 414     int i_cbp_dc = 0;
 415     int i_qp = h->mb.i_qp;
 416     int i;
 417
 418     if( h->mb.i_type == P_SKIP )
 419     {
 420         /* A bit special */
 421         x264_macroblock_encode_pskip( h );
 422         return;
 423     }
 424     if( h->mb.i_type == B_SKIP )
 425     {
 426         /* XXX motion compensation is probably unneeded */
 427         x264_mb_mc( h );
 428         x264_macroblock_encode_skip( h );
 429         return;
 430     }
 431
 432     if( h->mb.i_type == I_16x16 )
 433     {
 434         const int i_mode = h->mb.i_intra16x16_pred_mode;
 435         h->mb.b_transform_8x8 = 0;
 436         /* do the right prediction */
 437         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 438
 439         /* encode the 16x16 macroblock */
 440         x264_mb_encode_i16x16( h, i_qp );
 441     }
 442     else if( h->mb.i_type == I_8x8 )
 443     {
 444         h->mb.b_transform_8x8 = 1;
 445         for( i = 0; i < 4; i++ )
 446         {
 447             const int i_dst = h->mb.pic.i_stride[0];
 448             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * i_dst];
 449             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
 450
 451             h->predict_8x8[i_mode]( p_dst, i_dst, h->mb.i_neighbour8[i] );
 452             x264_mb_encode_i8x8( h, i, i_qp );
 453         }
 454     }
 455     else if( h->mb.i_type == I_4x4 )
 456     {
 457         h->mb.b_transform_8x8 = 0;
 458         for( i = 0; i < 16; i++ )
 459         {
 460             const int i_dst = h->mb.pic.i_stride[0];
 461             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst];
 462             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 463
 464             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
 465                 /* emulate missing topright samples */
 466                 *(uint32_t*) &p_dst[4 - i_dst] = p_dst[3 - i_dst] * 0x01010101U;
 467
 468             h->predict_4x4[i_mode]( p_dst, i_dst );
 469             x264_mb_encode_i4x4( h, i, i_qp );
 470         }
 471     }
 472     else    /* Inter MB */
 473     {
 474         int i8x8, i4x4, idx;
 475         int i_decimate_mb = 0;
 476
 477         /* Motion compensation */
 478         x264_mb_mc( h );
 479
 480         if( h->mb.b_lossless )
 481         {
 482             for( i4x4 = 0; i4x4 < 16; i4x4++ )
 483             {
 484                 int o = block_idx_x[i4x4]*4 + block_idx_y[i4x4]*4 * h->mb.pic.i_stride[0];
 485                 sub_zigzag_4x4full( h->dct.block[i4x4].luma4x4, h->mb.pic.p_fenc[0]+o, h->mb.pic.p_fdec[0]+o, h->mb.pic.i_stride[0] );
 486             }
 487         }
 488         else if( h->mb.b_transform_8x8 )
 489         {
 490             int16_t dct8x8[4][8][8];
 491             h->dctf.sub16x16_dct8( dct8x8,
 492                                    h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 493                                    h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 494
 495             for( idx = 0; idx < 4; idx++ )
 496             {
 497                 int i_decimate_8x8;
 498
 499                 quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
 500                 scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
 501                 x264_mb_dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
 502
 503                 i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
 504                 i_decimate_mb += i_decimate_8x8;
 505                 if( i_decimate_8x8 < 4 )
 506                 {
 507                     memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
 508                     memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
 509                 }
 510             }
 511
 512             if( i_decimate_mb < 6 )
 513                 memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
 514             else
 515                 h->dctf.add16x16_idct8( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct8x8 );
 516         }
 517         else
 518         {
 519             int16_t dct4x4[16][4][4];
 520             h->dctf.sub16x16_dct( dct4x4,
 521                                   h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 522                                   h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 523
 524             for( i8x8 = 0; i8x8 < 4; i8x8++ )
 525             {
 526                 int i_decimate_8x8;
 527
 528                 /* encode one 4x4 block */
 529                 i_decimate_8x8 = 0;
 530                 for( i4x4 = 0; i4x4 < 4; i4x4++ )
 531                 {
 532                     idx = i8x8 * 4 + i4x4;
 533
 534                     quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
 535                     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
 536                     x264_mb_dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
 537
 538                     i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
 539                 }
 540
 541                 /* decimate this 8x8 block */
 542                 i_decimate_mb += i_decimate_8x8;
 543                 if( i_decimate_8x8 < 4 )
 544                 {
 545                     for( i4x4 = 0; i4x4 < 4; i4x4++ )
 546                     {
 547                         int x, y;
 548                         idx = i8x8 * 4 + i4x4;
 549                         for( i = 0; i < 16; i++ )
 550                             h->dct.block[idx].luma4x4[i] = 0;
 551                         for( x = 0; x < 4; x++ )
 552                             for( y = 0; y < 4; y++ )
 553                                 dct4x4[idx][x][y] = 0;
 554                     }
 555                 }
 556             }
 557
 558             if( i_decimate_mb < 6 )
 559                 for( idx = 0; idx < 16; idx++ )
 560                     for( i = 0; i < 16; i++ )
 561                         h->dct.block[idx].luma4x4[i] = 0;
 562             else
 563                 h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 );
 564         }
 565     }
 566
 567     /* encode chroma */
 568     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 569     if( IS_INTRA( h->mb.i_type ) )
 570     {
 571         const int i_mode = h->mb.i_chroma_pred_mode;
 572         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
 573         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
 574     }
 575
 576     /* encode the 8x8 blocks */
 577     x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qp );
 578
 579     /* Calculate the Luma/Chroma patern and non_zero_count */
 580     h->mb.i_cbp_luma = 0x00;
 581     if( h->mb.i_type == I_16x16 )
 582     {
 583         for( i = 0; i < 16; i++ )
 584         {
 585             const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
 586             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 587             if( nz > 0 )
 588                 h->mb.i_cbp_luma = 0x0f;
 589         }
 590     }
 591     else if( h->mb.b_transform_8x8 )
 592     {
 593         /* coded_block_flag is enough for CABAC.
 594          * the full non_zero_count is done only in CAVLC. */
 595         for( i = 0; i < 4; i++ )
 596         {
 597             const int nz = array_non_zero( h->dct.luma8x8[i], 64 );
 598             int j;
 599             for( j = 0; j < 4; j++ )
 600                 h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
 601             if( nz > 0 )
 602                 h->mb.i_cbp_luma |= 1 << i;
 603         }
 604     }
 605     else
 606     {
 607         for( i = 0; i < 16; i++ )
 608         {
 609             const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
 610             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
 611             if( nz > 0 )
 612                 h->mb.i_cbp_luma |= 1 << (i/4);
 613         }
 614     }
 615
 616     /* Calculate the chroma patern */
 617     h->mb.i_cbp_chroma = 0x00;
 618     for( i = 0; i < 8; i++ )
 619     {
 620         const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
 621         h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
 622         if( nz > 0 )
 623         {
 624             h->mb.i_cbp_chroma = 0x02;    /* dc+ac (we can't do only ac) */
 625         }
 626     }
 627     if( h->mb.i_cbp_chroma == 0x00 &&
 628         ( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 || array_non_zero_count( h->dct.chroma_dc[1], 4 ) ) > 0 )
 629     {
 630         h->mb.i_cbp_chroma = 0x01;    /* dc only */
 631     }
 632
 633     if( h->param.b_cabac )
 634     {
 635         if( h->mb.i_type == I_16x16 && array_non_zero_count( h->dct.luma16x16_dc, 16 ) > 0 )
 636             i_cbp_dc = 0x01;
 637         else
 638             i_cbp_dc = 0x00;
 639
 640         if( array_non_zero_count( h->dct.chroma_dc[0], 4 ) > 0 )
 641             i_cbp_dc |= 0x02;
 642         if( array_non_zero_count( h->dct.chroma_dc[1], 4 ) > 0 )
 643             i_cbp_dc |= 0x04;
 644     }
 645
 646     /* store cbp */
 647     h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
 648
 649     /* Check for P_SKIP
 650      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
 651      *      (if multiple mv give same result)*/
 652     if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
 653         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
 654         h->mb.cache.ref[0][x264_scan8[0]] == 0 )
 655     {
 656         int mvp[2];
 657
 658         x264_mb_predict_mv_pskip( h, mvp );
 659         if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
 660             h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
 661         {
 662             h->mb.i_type = P_SKIP;
 663         }
 664     }
 665
 666     /* Check for B_SKIP */
 667     if( h->mb.i_type == B_DIRECT &&
 668         h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
 669     {
 670         h->mb.i_type = B_SKIP;
 671     }
 672 }
 673
 674 /*****************************************************************************
 675  * x264_macroblock_probe_skip:
 676  *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
 677  *  the previous QP
 678  *****************************************************************************/
 679 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 680 {
 681     DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
 682     DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
 683     DECLARE_ALIGNED( int,     dctscan[16], 16 );
 684
 685     int i_qp = h->mb.i_qp;
 686     int mvp[2];
 687     int ch;
 688
 689     int i8x8, i4x4;
 690     int i_decimate_mb;
 691
 692     if( !b_bidir )
 693     {
 694         /* Get the MV */
 695         x264_mb_predict_mv_pskip( h, mvp );
 696         mvp[0] = x264_clip3( mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
 697         mvp[1] = x264_clip3( mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
 698
 699         /* Motion compensation */
 700         h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
 701                         h->mb.pic.p_fdec[0],   h->mb.pic.i_stride[0],
 702                         mvp[0], mvp[1], 16, 16 );
 703     }
 704
 705     /* get luma diff */
 706     h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
 707                                   h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 708
 709     for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
 710     {
 711         /* encode one 4x4 block */
 712         for( i4x4 = 0; i4x4 < 4; i4x4++ )
 713         {
 714             const int idx = i8x8 * 4 + i4x4;
 715
 716             quant_4x4( h, dct4x4[idx], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 717             scan_zigzag_4x4full( dctscan, dct4x4[idx] );
 718
 719             i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
 720
 721             if( i_decimate_mb >= 6 )
 722             {
 723                 /* not as P_SKIP */
 724                 return 0;
 725             }
 726         }
 727     }
 728
 729     /* encode chroma */
 730     i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
 731
 732     for( ch = 0; ch < 2; ch++ )
 733     {
 734         const int i_stride = h->mb.pic.i_stride[1+ch];
 735         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
 736         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 737
 738         if( !b_bidir )
 739         {
 740             h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4+ch], i_stride,
 741                               h->mb.pic.p_fdec[1+ch],       i_stride,
 742                               mvp[0], mvp[1], 8, 8 );
 743         }
 744
 745         h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 746
 747         /* calculate dct DC */
 748         dct2x2[0][0] = dct4x4[0][0][0];
 749         dct2x2[0][1] = dct4x4[1][0][0];
 750         dct2x2[1][0] = dct4x4[2][0][0];
 751         dct2x2[1][1] = dct4x4[3][0][0];
 752         h->dctf.dct2x2dc( dct2x2 );
 753         quant_2x2_dc( h, dct2x2, (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 754         if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
 755         {
 756             /* can't be */
 757             return 0;
 758         }
 759
 760         /* calculate dct coeffs */
 761         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
 762         {
 763             quant_4x4( h, dct4x4[i4x4], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
 764             scan_zigzag_4x4( dctscan, dct4x4[i4x4] );
 765
 766             i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
 767             if( i_decimate_mb >= 7 )
 768             {
 769                 return 0;
 770             }
 771         }
 772     }
 773
 774     return 1;
 775 }