/*****************************************************************************
 * macroblock.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003 Laurent Aimar
 * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/
27 #include "common/common.h"
28 #include "macroblock.h"
/* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
/* FIXME this seems to make better decisions with cqm=jvt, but could screw up
 * with general custom matrices. */
/* Default 4x4 quantization multipliers, one table per QP%6 value. */
static const int def_quant4_mf[6][4][4] =
{
    { { 13107, 8066, 13107, 8066 }, {  8066, 5243,  8066, 5243 },
      { 13107, 8066, 13107, 8066 }, {  8066, 5243,  8066, 5243 } },
    { { 11916, 7490, 11916, 7490 }, {  7490, 4660,  7490, 4660 },
      { 11916, 7490, 11916, 7490 }, {  7490, 4660,  7490, 4660 } },
    { { 10082, 6554, 10082, 6554 }, {  6554, 4194,  6554, 4194 },
      { 10082, 6554, 10082, 6554 }, {  6554, 4194,  6554, 4194 } },
    { {  9362, 5825,  9362, 5825 }, {  5825, 3647,  5825, 3647 },
      {  9362, 5825,  9362, 5825 }, {  5825, 3647,  5825, 3647 } },
    { {  8192, 5243,  8192, 5243 }, {  5243, 3355,  5243, 3355 },
      {  8192, 5243,  8192, 5243 }, {  5243, 3355,  5243, 3355 } },
    { {  7282, 4559,  7282, 4559 }, {  4559, 2893,  4559, 2893 },
      {  7282, 4559,  7282, 4559 }, {  4559, 2893,  4559, 2893 } }
};
/****************************************************************************
 * Scan and Quant functions
 ****************************************************************************/
/* ZIG(i,y,x): store coefficient (x,y) of the transform block as the i-th
 * entry of the zig-zag-ordered output.  Note the macro takes (i,y,x) but
 * indexes dct[x][y]. */
#define ZIG(i,y,x) level[i] = dct[x][y];
/* Reorder a full 8x8 transform block (DC included) into zig-zag scan order. */
static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] )
{
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
}
/* Reorder a full 4x4 transform block (DC included) into zig-zag scan order. */
#ifndef ZIG
#define ZIG(i,y,x) level[i] = dct[x][y];
#endif
static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] )
{
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
}
/* Reorder the 15 AC coefficients of a 4x4 block into zig-zag scan order,
 * skipping the DC (position 0,0) — used when DC is coded separately. */
#ifndef ZIG
#define ZIG(i,y,x) level[i] = dct[x][y];
#endif
static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] )
{
                ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
    ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
    ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
    ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
}
/* Reorder the 2x2 chroma DC block into scan order (DC, right, down, diag). */
#ifndef ZIG
#define ZIG(i,y,x) level[i] = dct[x][y];
#endif
static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] )
{
    ZIG(0,0,0)
    ZIG(1,0,1)
    ZIG(2,1,0)
    ZIG(3,1,1)
}
98 int oe = x+y*FENC_STRIDE;\
99 int od = x+y*FDEC_STRIDE;\
100 level[i] = p_src[oe] - p_dst[od];\
101 p_dst[od] = p_src[oe];\
103 static inline void sub_zigzag_4x4full( int level[16], const uint8_t *p_src, uint8_t *p_dst )
105 ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
106 ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
107 ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
108 ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
110 static inline void sub_zigzag_4x4( int level[15], const uint8_t *p_src, uint8_t *p_dst )
112 ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
113 ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
114 ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
115 ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
119 static void quant_8x8( x264_t *h, int16_t dct[8][8], int quant_mf[6][8][8], int i_qscale, int b_intra )
121 const int i_qbits = 16 + i_qscale / 6;
122 const int i_mf = i_qscale % 6;
123 const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
124 h->quantf.quant_8x8_core( dct, quant_mf[i_mf], i_qbits, f );
126 static void quant_4x4( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale, int b_intra )
128 const int i_qbits = 15 + i_qscale / 6;
129 const int i_mf = i_qscale % 6;
130 const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
131 h->quantf.quant_4x4_core( dct, quant_mf[i_mf], i_qbits, f );
133 static void quant_4x4_dc( x264_t *h, int16_t dct[4][4], int quant_mf[6][4][4], int i_qscale )
135 const int i_qbits = 16 + i_qscale / 6;
136 const int i_mf = i_qscale % 6;
137 const int f = ( 1 << i_qbits ) / 3;
138 h->quantf.quant_4x4_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
140 static void quant_2x2_dc( x264_t *h, int16_t dct[2][2], int quant_mf[6][4][4], int i_qscale, int b_intra )
142 const int i_qbits = 16 + i_qscale / 6;
143 const int i_mf = i_qscale % 6;
144 const int f = ( 1 << i_qbits ) / ( b_intra ? 3 : 6 );
145 h->quantf.quant_2x2_dc_core( dct, quant_mf[i_mf][0][0], i_qbits, f );
/****************************************************************************
 * x264_mb_decimate_score: given dct coeffs it returns a score to see if we
 * could empty this dct coeffs to 0 (low score means set it to null)
 * Used in inter macroblock (luma and chroma)
 *  luma: for a 8x8 block: if score < 4 -> null
 *        for the complete mb: if score < 6 -> null
 *  chroma: for the complete mb: if score < 7 -> null
 ****************************************************************************/
static int x264_mb_decimate_score( int *dct, int i_max )
{
    /* cost of a +-1 coefficient as a function of the zero-run before it */
    static const int i_ds_table4[16] = {
        3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
    static const int i_ds_table8[64] = {
        3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
        1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

    const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4;

    int i_score = 0;
    int idx = i_max - 1;

    /* skip trailing zeros */
    while( idx >= 0 && dct[idx] == 0 )
        idx--;

    while( idx >= 0 )
    {
        int i_run;

        /* any coefficient with |level| > 1 is never decimated */
        if( abs( dct[idx--] ) > 1 )
            return 9;

        i_run = 0;
        while( idx >= 0 && dct[idx] == 0 )
        {
            idx--;
            i_run++;
        }

        i_score += ds_table[i_run];
    }

    return i_score;
}
192 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
194 int x = 4 * block_idx_x[idx];
195 int y = 4 * block_idx_y[idx];
196 uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
197 uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
198 int16_t dct4x4[4][4];
200 if( h->mb.b_lossless )
202 sub_zigzag_4x4full( h->dct.block[idx].luma4x4, p_src, p_dst );
206 h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
208 if( h->mb.b_trellis )
209 x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
211 quant_4x4( h, dct4x4, h->quant4_mf[CQM_4IY], i_qscale, 1 );
213 scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 );
214 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
216 /* output samples to fdec */
217 h->dctf.add4x4_idct( p_dst, dct4x4 );
220 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
223 int y = 8 * (idx>>1);
224 uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
225 uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
226 int16_t dct8x8[8][8];
228 h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
230 if( h->mb.b_trellis )
231 x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
233 quant_8x8( h, dct8x8, h->quant8_mf[CQM_8IY], i_qscale, 1 );
235 scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 );
236 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
237 h->dctf.add8x8_idct8( p_dst, dct8x8 );
240 static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
242 uint8_t *p_src = h->mb.pic.p_fenc[0];
243 uint8_t *p_dst = h->mb.pic.p_fdec[0];
245 int16_t dct4x4[16+1][4][4];
249 if( h->mb.b_lossless )
251 for( i = 0; i < 16; i++ )
253 int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
254 int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
255 sub_zigzag_4x4( h->dct.block[i].residual_ac, p_src+oe, p_dst+od );
256 dct4x4[0][block_idx_y[i]][block_idx_x[i]] = p_src[oe] - p_dst[od];
257 p_dst[od] = p_src[oe];
259 scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
263 h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
264 for( i = 0; i < 16; i++ )
267 dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
269 /* quant/scan/dequant */
270 if( h->mb.b_trellis )
271 x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
273 quant_4x4( h, dct4x4[1+i], h->quant4_mf[CQM_4IY], i_qscale, 1 );
275 scan_zigzag_4x4( h->dct.block[i].residual_ac, dct4x4[1+i] );
276 h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
279 h->dctf.dct4x4dc( dct4x4[0] );
280 quant_4x4_dc( h, dct4x4[0], h->quant4_mf[CQM_4IY], i_qscale );
281 scan_zigzag_4x4full( h->dct.luma16x16_dc, dct4x4[0] );
283 /* output samples to fdec */
284 h->dctf.idct4x4dc( dct4x4[0] );
285 x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale ); /* XXX not inversed */
287 /* calculate dct coeffs */
288 for( i = 0; i < 16; i++ )
291 dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
293 /* put pixels to fdec */
294 h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
297 static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
300 int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
302 for( ch = 0; ch < 2; ch++ )
304 uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
305 uint8_t *p_dst = h->mb.pic.p_fdec[1+ch];
306 int i_decimate_score = 0;
308 int16_t dct2x2[2][2];
309 int16_t dct4x4[4][4][4];
311 if( h->mb.b_lossless )
313 for( i = 0; i < 4; i++ )
315 int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
316 int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
317 sub_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, p_src+oe, p_dst+od );
318 h->dct.chroma_dc[ch][i] = p_src[oe] - p_dst[od];
319 p_dst[od] = p_src[oe];
324 h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
325 /* calculate dct coeffs */
326 for( i = 0; i < 4; i++ )
329 dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
331 /* no trellis; it doesn't seem to help chroma noticeably */
332 quant_4x4( h, dct4x4[i], h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
333 scan_zigzag_4x4( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
337 i_decimate_score += x264_mb_decimate_score( h->dct.block[16+i+ch*4].residual_ac, 15 );
341 h->dctf.dct2x2dc( dct2x2 );
342 quant_2x2_dc( h, dct2x2, h->quant4_mf[CQM_4IC + b_inter], i_qscale, !b_inter );
343 scan_zigzag_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
345 /* output samples to fdec */
346 h->dctf.idct2x2dc( dct2x2 );
347 x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale ); /* XXX not inversed */
349 if( b_decimate && i_decimate_score < 7 )
351 /* Near null chroma 8x8 block so make it null (bits saving) */
352 memset( &h->dct.block[16+ch*4], 0, 4 * sizeof( *h->dct.block ) );
353 if( !array_non_zero( (int*)dct2x2, sizeof(dct2x2)/sizeof(int) ) )
355 memset( dct4x4, 0, sizeof( dct4x4 ) );
359 for( i = 0; i < 4; i++ )
360 h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
363 for( i = 0; i < 4; i++ )
364 dct4x4[i][0][0] = dct2x2[0][i];
365 h->dctf.add8x8_idct( p_dst, dct4x4 );
369 static void x264_macroblock_encode_skip( x264_t *h )
372 h->mb.i_cbp_luma = 0x00;
373 h->mb.i_cbp_chroma = 0x00;
375 for( i = 0; i < 16+8; i++ )
377 h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
381 h->mb.cbp[h->mb.i_mb_xy] = 0;
384 /*****************************************************************************
385 * x264_macroblock_encode_pskip:
386 * Encode an already marked skip block
387 *****************************************************************************/
388 void x264_macroblock_encode_pskip( x264_t *h )
390 const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
391 h->mb.mv_min[0], h->mb.mv_max[0] );
392 const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
393 h->mb.mv_min[1], h->mb.mv_max[1] );
395 /* Motion compensation XXX probably unneeded */
396 h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
397 h->mb.pic.p_fdec[0], FDEC_STRIDE,
401 h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
402 h->mb.pic.p_fdec[1], FDEC_STRIDE,
405 h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
406 h->mb.pic.p_fdec[2], FDEC_STRIDE,
409 x264_macroblock_encode_skip( h );
412 /*****************************************************************************
413 * x264_macroblock_encode:
414 *****************************************************************************/
415 void x264_macroblock_encode( x264_t *h )
418 int i_qp = h->mb.i_qp;
419 int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
422 if( h->mb.i_type == P_SKIP )
425 x264_macroblock_encode_pskip( h );
428 if( h->mb.i_type == B_SKIP )
430 /* XXX motion compensation is probably unneeded */
432 x264_macroblock_encode_skip( h );
436 if( h->mb.i_type == I_16x16 )
438 const int i_mode = h->mb.i_intra16x16_pred_mode;
439 h->mb.b_transform_8x8 = 0;
440 /* do the right prediction */
441 h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
443 /* encode the 16x16 macroblock */
444 x264_mb_encode_i16x16( h, i_qp );
446 else if( h->mb.i_type == I_8x8 )
448 h->mb.b_transform_8x8 = 1;
449 for( i = 0; i < 4; i++ )
451 uint8_t *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
452 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
454 h->predict_8x8[i_mode]( p_dst, h->mb.i_neighbour8[i] );
455 x264_mb_encode_i8x8( h, i, i_qp );
458 else if( h->mb.i_type == I_4x4 )
460 h->mb.b_transform_8x8 = 0;
461 for( i = 0; i < 16; i++ )
463 uint8_t *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE];
464 int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
466 if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
467 /* emulate missing topright samples */
468 *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
470 h->predict_4x4[i_mode]( p_dst );
471 x264_mb_encode_i4x4( h, i, i_qp );
477 int i_decimate_mb = 0;
479 /* Motion compensation */
482 if( h->mb.b_lossless )
484 for( i4x4 = 0; i4x4 < 16; i4x4++ )
486 int x = 4*block_idx_x[i4x4];
487 int y = 4*block_idx_y[i4x4];
488 sub_zigzag_4x4full( h->dct.block[i4x4].luma4x4,
489 h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
490 h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
493 else if( h->mb.b_transform_8x8 )
495 int16_t dct8x8[4][8][8];
496 int nnz8x8[4] = {1,1,1,1};
497 b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
498 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
500 for( idx = 0; idx < 4; idx++ )
502 if( h->mb.b_noise_reduction )
503 x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
504 if( h->mb.b_trellis )
505 x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
507 quant_8x8( h, dct8x8[idx], h->quant8_mf[CQM_8PY], i_qp, 0 );
509 scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
513 int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
514 i_decimate_mb += i_decimate_8x8;
515 if( i_decimate_8x8 < 4 )
517 memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
518 memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
523 nnz8x8[idx] = array_non_zero( (int*)dct8x8[idx], sizeof(*dct8x8)/sizeof(int) );
526 if( i_decimate_mb < 6 && b_decimate )
527 memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
530 for( idx = 0; idx < 4; idx++ )
533 h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
534 h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
540 int16_t dct4x4[16][4][4];
541 int nnz8x8[4] = {1,1,1,1};
542 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
544 for( i8x8 = 0; i8x8 < 4; i8x8++ )
548 /* encode one 4x4 block */
550 for( i4x4 = 0; i4x4 < 4; i4x4++ )
552 idx = i8x8 * 4 + i4x4;
554 if( h->mb.b_noise_reduction )
555 x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
556 if( h->mb.b_trellis )
557 x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
559 quant_4x4( h, dct4x4[idx], h->quant4_mf[CQM_4PY], i_qp, 0 );
561 scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
564 i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
567 /* decimate this 8x8 block */
568 i_decimate_mb += i_decimate_8x8;
569 if( i_decimate_8x8 < 4 && b_decimate )
571 memset( &dct4x4[i8x8*4], 0, 4 * sizeof( *dct4x4 ) );
572 memset( &h->dct.block[i8x8*4], 0, 4 * sizeof( *h->dct.block ) );
577 if( i_decimate_mb < 6 && b_decimate )
578 memset( h->dct.block, 0, 16 * sizeof( *h->dct.block ) );
581 for( i8x8 = 0; i8x8 < 4; i8x8++ )
584 for( i = 0; i < 4; i++ )
585 h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
586 h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
593 i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
594 if( IS_INTRA( h->mb.i_type ) )
596 const int i_mode = h->mb.i_chroma_pred_mode;
597 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
598 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
601 /* encode the 8x8 blocks */
602 x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qp );
604 /* Calculate the Luma/Chroma patern and non_zero_count */
605 h->mb.i_cbp_luma = 0x00;
606 if( h->mb.i_type == I_16x16 )
608 for( i = 0; i < 16; i++ )
610 const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
611 h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
613 h->mb.i_cbp_luma = 0x0f;
616 else if( h->mb.b_transform_8x8 )
618 /* coded_block_flag is enough for CABAC.
619 * the full non_zero_count is done only in CAVLC. */
620 for( i = 0; i < 4; i++ )
622 const int nz = array_non_zero( h->dct.luma8x8[i], 64 );
624 for( j = 0; j < 4; j++ )
625 h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
627 h->mb.i_cbp_luma |= 1 << i;
632 for( i = 0; i < 16; i++ )
634 const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
635 h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
637 h->mb.i_cbp_luma |= 1 << (i/4);
641 /* Calculate the chroma patern */
642 h->mb.i_cbp_chroma = 0x00;
643 for( i = 0; i < 8; i++ )
645 const int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
646 h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
649 h->mb.i_cbp_chroma = 0x02; /* dc+ac (we can't do only ac) */
652 if( h->mb.i_cbp_chroma == 0x00 && array_non_zero( h->dct.chroma_dc[0], 8 ) )
654 h->mb.i_cbp_chroma = 0x01; /* dc only */
657 if( h->param.b_cabac )
659 i_cbp_dc = ( h->mb.i_type == I_16x16 && array_non_zero( h->dct.luma16x16_dc, 16 ) )
660 | array_non_zero( h->dct.chroma_dc[0], 4 ) << 1
661 | array_non_zero( h->dct.chroma_dc[1], 4 ) << 2;
665 h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
668 * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
669 * (if multiple mv give same result)*/
670 if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
671 h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 &&
672 h->mb.cache.ref[0][x264_scan8[0]] == 0 )
676 x264_mb_predict_mv_pskip( h, mvp );
677 if( h->mb.cache.mv[0][x264_scan8[0]][0] == mvp[0] &&
678 h->mb.cache.mv[0][x264_scan8[0]][1] == mvp[1] )
680 h->mb.i_type = P_SKIP;
684 /* Check for B_SKIP */
685 if( h->mb.i_type == B_DIRECT &&
686 h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
688 h->mb.i_type = B_SKIP;
692 /*****************************************************************************
693 * x264_macroblock_probe_skip:
694 * Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
696 *****************************************************************************/
697 int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
699 DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
700 DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
701 DECLARE_ALIGNED( int, dctscan[16], 16 );
703 int i_qp = h->mb.i_qp;
713 x264_mb_predict_mv_pskip( h, mvp );
714 mvp[0] = x264_clip3( mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
715 mvp[1] = x264_clip3( mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
717 /* Motion compensation */
718 h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
719 h->mb.pic.p_fdec[0], FDEC_STRIDE,
720 mvp[0], mvp[1], 16, 16 );
724 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0],
725 h->mb.pic.p_fdec[0] );
727 for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
729 /* encode one 4x4 block */
730 for( i4x4 = 0; i4x4 < 4; i4x4++ )
732 const int idx = i8x8 * 4 + i4x4;
734 quant_4x4( h, dct4x4[idx], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
735 scan_zigzag_4x4full( dctscan, dct4x4[idx] );
737 i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
739 if( i_decimate_mb >= 6 )
748 i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
750 for( ch = 0; ch < 2; ch++ )
752 uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
753 uint8_t *p_dst = h->mb.pic.p_fdec[1+ch];
757 h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
758 h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
759 mvp[0], mvp[1], 8, 8 );
762 h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
764 /* calculate dct DC */
765 dct2x2[0][0] = dct4x4[0][0][0];
766 dct2x2[0][1] = dct4x4[1][0][0];
767 dct2x2[1][0] = dct4x4[2][0][0];
768 dct2x2[1][1] = dct4x4[3][0][0];
769 h->dctf.dct2x2dc( dct2x2 );
770 quant_2x2_dc( h, dct2x2, (int(*)[4][4])def_quant4_mf, i_qp, 0 );
771 if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1] )
777 /* calculate dct coeffs */
778 for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
780 quant_4x4( h, dct4x4[i4x4], (int(*)[4][4])def_quant4_mf, i_qp, 0 );
781 scan_zigzag_4x4( dctscan, dct4x4[i4x4] );
783 i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
784 if( i_decimate_mb >= 7 )
794 /****************************************************************************
795 * DCT-domain noise reduction / adaptive deadzone
797 ****************************************************************************/
799 void x264_noise_reduction_update( x264_t *h )
802 for( cat = 0; cat < 2; cat++ )
804 int size = cat ? 64 : 16;
805 const int *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
807 if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
809 for( i = 0; i < size; i++ )
810 h->nr_residual_sum[cat][i] >>= 1;
811 h->nr_count[cat] >>= 1;
814 for( i = 0; i < size; i++ )
815 h->nr_offset[cat][i] =
816 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
817 + h->nr_residual_sum[cat][i]/2)
818 / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
822 void x264_denoise_dct( x264_t *h, int16_t *dct )
824 const int cat = h->mb.b_transform_8x8;
829 for( i = (cat ? 63 : 15); i >= 1; i-- )
836 h->nr_residual_sum[cat][i] += level;
837 level -= h->nr_offset[cat][i];
843 h->nr_residual_sum[cat][i] -= level;
844 level += h->nr_offset[cat][i];
853 /*****************************************************************************
854 * RD only; 4 calls to this do not make up for one macroblock_encode.
855 * doesn't transform chroma dc.
856 *****************************************************************************/
857 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
859 int i_qp = h->mb.i_qp;
860 uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
861 uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
862 int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
866 x264_mb_mc_8x8( h, i8 );
868 if( h->mb.b_transform_8x8 )
870 int16_t dct8x8[8][8];
871 h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
872 quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
873 scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
876 nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
878 nnz8x8 = array_non_zero( (int*)dct8x8, sizeof(dct8x8)/sizeof(int) );
882 h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
883 h->dctf.add8x8_idct8( p_fdec, dct8x8 );
889 int16_t dct4x4[4][4][4];
890 h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
891 quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
892 quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
893 quant_4x4( h, dct4x4[2], h->quant4_mf[CQM_4PY], i_qp, 0 );
894 quant_4x4( h, dct4x4[3], h->quant4_mf[CQM_4PY], i_qp, 0 );
895 for( i4 = 0; i4 < 4; i4++ )
896 scan_zigzag_4x4full( h->dct.block[i8*4+i4].luma4x4, dct4x4[i4] );
900 int i_decimate_8x8 = 0;
901 for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
902 i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[i8*4+i4].luma4x4, 16 );
903 nnz8x8 = 4 <= i_decimate_8x8;
906 nnz8x8 = array_non_zero( (int*)dct4x4, sizeof(dct4x4)/sizeof(int) );
910 for( i4 = 0; i4 < 4; i4++ )
911 h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
912 h->dctf.add8x8_idct( p_fdec, dct4x4 );
916 i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
918 for( ch = 0; ch < 2; ch++ )
920 int16_t dct4x4[4][4];
921 p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
922 p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
924 h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
925 quant_4x4( h, dct4x4, h->quant4_mf[CQM_4PC], i_qp, 0 );
926 scan_zigzag_4x4( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 );
927 if( array_non_zero( (int*)dct4x4, sizeof(dct4x4)/sizeof(int) ) )
929 h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
930 h->dctf.add4x4_idct( p_fdec, dct4x4 );
935 h->mb.i_cbp_luma |= (1 << i8);
937 h->mb.i_cbp_luma &= ~(1 << i8);
938 h->mb.i_cbp_chroma = 0x02;