/* x264 — encoder/macroblock.c (upstream snapshot 4342159e78126e13110de3b52673d0bd35d1fc73) */
1 /*****************************************************************************
2  * macroblock.c: h264 encoder library
3  *****************************************************************************
4  * Copyright (C) 2003 Laurent Aimar
5  * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
6  *
7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #include "common/common.h"
25 #include "macroblock.h"
26
27
/* Scan the 2x2 chroma DC block into coefficient (level) order.
 * Order is (x,y) = (0,0), (1,0), (0,1), (1,1) in dct[x][y] terms. */
static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
{
    level[0] = dct[0][0];
    level[1] = dct[1][0];
    level[2] = dct[0][1];
    level[3] = dct[1][1];
}
37
/* (ref: JVT-B118)
 * x264_mb_decimate_score: score a run of quantized coefficients to decide
 * whether the whole block may be zeroed out (lower score => safer to null).
 * Any coefficient with magnitude > 1 immediately disqualifies the block
 * (score 9); otherwise each ±1 contributes a value that shrinks with the
 * length of the zero-run below it.
 * Thresholds used by callers:
 *  luma: per 8x8 block: score < 4 -> null; whole mb: score < 6 -> null
 *  chroma: whole mb: score < 7 -> null
 */
static int x264_mb_decimate_score( int16_t *dct, int i_max )
{
    static const int i_ds_table4[16] = {
        3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
    static const int i_ds_table8[64] = {
        3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
        1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

    const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4;
    int i_score = 0;
    int idx = i_max;

    /* Trailing (highest-frequency) zeros contribute nothing. */
    while( idx > 0 && dct[idx-1] == 0 )
        idx--;

    while( idx > 0 )
    {
        int i_run = 0;

        /* A level of magnitude >= 2 is never decimated. */
        if( abs( dct[--idx] ) > 1 )
            return 9;

        /* Count the zero-run below this ±1 coefficient. */
        while( idx > 0 && dct[idx-1] == 0 )
        {
            idx--;
            i_run++;
        }
        i_score += ds_table[i_run];
    }

    return i_score;
}
81
82 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
83 {
84     int x = 4 * block_idx_x[idx];
85     int y = 4 * block_idx_y[idx];
86     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
87     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
88     DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
89
90     if( h->mb.b_lossless )
91     {
92         h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
93         return;
94     }
95
96     h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
97
98     if( h->mb.b_trellis )
99         x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
100     else
101         h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
102
103     h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
104     h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
105
106     /* output samples to fdec */
107     h->dctf.add4x4_idct( p_dst, dct4x4 );
108 }
109
110 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
111 {
112     int x = 8 * (idx&1);
113     int y = 8 * (idx>>1);
114     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
115     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
116     DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
117
118     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
119
120     if( h->mb.b_trellis )
121         x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
122     else 
123         h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] );
124
125     h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
126     h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
127     h->dctf.add8x8_idct8( p_dst, dct8x8 );
128 }
129
/* Encode an I_16x16 luma macroblock: the 16 DC coefficients get an extra
 * 4x4 Hadamard transform and are coded separately (luma16x16_dc); the 16
 * AC 4x4 blocks are coded with their DC forced to zero. */
static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
{
    uint8_t  *p_src = h->mb.pic.p_fenc[0];
    uint8_t  *p_dst = h->mb.pic.p_fdec[0];

    /* dct4x4[0] collects the 16 DC coefficients; dct4x4[1..16] are the AC blocks. */
    DECLARE_ALIGNED_16( int16_t dct4x4[16+1][4][4] );

    int i;

    if( h->mb.b_lossless )
    {
        /* Lossless: code the raw residual per 4x4 block; each block's first
         * coefficient doubles as its "DC" for the 16x16 DC scan. */
        for( i = 0; i < 16; i++ )
        {
            int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
            int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
            h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
            /* NOTE(review): DC block is indexed [x][y] here but [y][x] in the
             * transform path below — confirm the transpose is intended. */
            dct4x4[0][block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0];
        }
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
        return;
    }

    h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
    for( i = 0; i < 16; i++ )
    {
        /* copy dc coeff, then zero it so the AC block codes no DC */
        dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
        dct4x4[1+i][0][0] = 0;

        /* quant/scan/dequant */
        if( h->mb.b_trellis )
            x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
        else
            h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );

        h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[1+i] );
        h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
    }

    /* DC block: extra Hadamard transform, then DC-specific quant
     * (mf>>1 / bias<<1 — presumably compensating the DC transform gain). */
    h->dctf.dct4x4dc( dct4x4[0] );
    h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
    h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );

    /* output samples to fdec */
    h->dctf.idct4x4dc( dct4x4[0] );
    x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */

    /* re-insert the reconstructed DC into each AC block before the inverse DCT */
    for( i = 0; i < 16; i++ )
    {
        /* copy dc coeff */
        dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
    }
    /* put pixels to fdec */
    h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
}
186
/* Encode both 8x8 chroma planes: 2x2 DC transform coded separately, four 4x4
 * AC blocks per plane, optional decimation of near-empty inter blocks, and
 * computation of i_cbp_chroma (0 = none, 1 = DC only, 2 = DC+AC). */
void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
{
    int i, ch;
    /* Decimation only for inter blocks (always in B slices, else per option). */
    int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);

    for( ch = 0; ch < 2; ch++ )
    {
        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = 0;

        DECLARE_ALIGNED_16( int16_t dct2x2[2][2]  );
        DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );

        if( h->mb.b_lossless )
        {
            /* Lossless: raw residual per 4x4 block; first coefficient of each
             * block doubles as the chroma DC. */
            for( i = 0; i < 4; i++ )
            {
                int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
                int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
                h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od );
                h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
            }
            continue;
        }

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
        /* calculate dct coeffs */
        for( i = 0; i < 4; i++ )
        {
            /* copy dc coeff, then zero it so the AC block codes no DC */
            dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
            dct4x4[i][0][0] = 0;

            /* no trellis; it doesn't seem to help chroma noticeably */
            h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] );
            h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );

            if( b_decimate )
            {
                /* AC-only score (skip coeff 0, the zeroed DC slot). */
                i_decimate_score += x264_mb_decimate_score( h->dct.luma4x4[16+i+ch*4]+1, 15 );
            }
        }

        /* DC: 2x2 Hadamard, DC-specific quant (mf>>1 / bias<<1), then scan. */
        h->dctf.dct2x2dc( dct2x2 );
        h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qscale][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qscale][0]<<1 );
        zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );

        /* output samples to fdec */
        h->dctf.idct2x2dc( dct2x2 );
        x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale );  /* XXX not inversed */

        if( b_decimate && i_decimate_score < 7 )
        {
            /* Near null chroma 8x8 block so make it null (bits saving) */
            memset( &h->dct.luma4x4[16+ch*4], 0, 4 * sizeof( *h->dct.luma4x4 ) );
            /* If the DC is also empty, the reconstruction is the prediction:
             * nothing left to add. */
            if( !array_non_zero( dct2x2 ) )
                continue;
            memset( dct4x4, 0, sizeof( dct4x4 ) );
        }
        else
        {
            for( i = 0; i < 4; i++ )
                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
        }

        /* Re-insert the reconstructed DCs and add the residual to fdec. */
        for( i = 0; i < 4; i++ )
            dct4x4[i][0][0] = dct2x2[0][i];
        h->dctf.add8x8_idct( p_dst, dct4x4 );
    }

    /* coded block pattern */
    h->mb.i_cbp_chroma = 0;
    for( i = 0; i < 8; i++ )
    {
        int nz = array_non_zero( h->dct.luma4x4[16+i] );
        h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
        h->mb.i_cbp_chroma |= nz;
    }
    if( h->mb.i_cbp_chroma )
        h->mb.i_cbp_chroma = 2;    /* dc+ac (we can't do only ac) */
    else if( array_non_zero( h->dct.chroma_dc ) )
        h->mb.i_cbp_chroma = 1;    /* dc only */
}
271
272 static void x264_macroblock_encode_skip( x264_t *h )
273 {
274     int i;
275     h->mb.i_cbp_luma = 0x00;
276     h->mb.i_cbp_chroma = 0x00;
277
278     for( i = 0; i < 16+8; i++ )
279     {
280         h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
281     }
282
283     /* store cbp */
284     h->mb.cbp[h->mb.i_mb_xy] = 0;
285 }
286
287 /*****************************************************************************
288  * x264_macroblock_encode_pskip:
289  *  Encode an already marked skip block
290  *****************************************************************************/
291 void x264_macroblock_encode_pskip( x264_t *h )
292 {
293     const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
294                                 h->mb.mv_min[0], h->mb.mv_max[0] );
295     const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
296                                 h->mb.mv_min[1], h->mb.mv_max[1] );
297
298     /* don't do pskip motion compensation if it was already done in macroblock_analyse */
299     if( !h->mb.b_skip_pbskip_mc )
300     {
301         h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
302                        h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
303                        mvx, mvy, 16, 16 );
304
305         h->mc.mc_chroma( h->mb.pic.p_fdec[1],       FDEC_STRIDE,
306                          h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
307                          mvx, mvy, 8, 8 );
308
309         h->mc.mc_chroma( h->mb.pic.p_fdec[2],       FDEC_STRIDE,
310                          h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
311                          mvx, mvy, 8, 8 );
312     }
313
314     x264_macroblock_encode_skip( h );
315 }
316
/*****************************************************************************
 * x264_macroblock_encode:
 *  Encode the current macroblock: handle skips, run the intra or inter
 *  transform/quant pipeline for luma, encode chroma, compute the coded
 *  block pattern, and finally demote eligible blocks back to skip.
 *****************************************************************************/
void x264_macroblock_encode( x264_t *h )
{
    int i_cbp_dc = 0;
    int i_qp = h->mb.i_qp;
    /* Decimation: always in B slices, otherwise controlled by option. */
    int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
    int b_force_no_skip = 0;
    int i,j,idx;
    /* Per-8x8 "has nonzero coefficients" flags; intra 16x16/4x4 paths
     * leave them at 1 and compute CBP from the coefficients directly. */
    uint8_t nnz8x8[4] = {1,1,1,1};

    if( h->sh.b_mbaff
        && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
        && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
    {
        /* The first skip is predicted to be a frame mb pair.
         * We don't yet support the aff part of mbaff, so force it to non-skip
         * so that we can pick the aff flag. */
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    if( h->mb.i_type == P_SKIP )
    {
        /* A bit special */
        x264_macroblock_encode_pskip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_pbskip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        const int i_mode = h->mb.i_intra16x16_pred_mode;
        h->mb.b_transform_8x8 = 0;
        /* do the right prediction */
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );

        /* encode the 16x16 macroblock */
        x264_mb_encode_i16x16( h, i_qp );
    }
    else if( h->mb.i_type == I_8x8 )
    {
        DECLARE_ALIGNED_16( uint8_t edge[33] );
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( i = h->mb.i_skip_intra ? 3 : 0 ; i < 4; i++ )
        {
            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];

            /* Build the filtered edge samples the 8x8 predictors need. */
            x264_predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
            h->predict_8x8[i_mode]( p_dst, edge );
            x264_mb_encode_i8x8( h, i, i_qp );
        }
        for( i = 0; i < 4; i++ )
            nnz8x8[i] = array_non_zero( h->dct.luma8x8[i] );
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
        {
            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE];
            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

            if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples by replicating the last top pixel */
                *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;

            h->predict_4x4[i_mode]( p_dst );
            x264_mb_encode_i4x4( h, i, i_qp );
        }
    }
    else    /* Inter MB */
    {
        int i8x8, i4x4;
        int i_decimate_mb = 0;

        /* Motion compensation */
        x264_mb_mc( h );

        if( h->mb.b_lossless )
        {
            /* Lossless: code each 4x4 residual directly. */
            for( i4x4 = 0; i4x4 < 16; i4x4++ )
            {
                int x = 4*block_idx_x[i4x4];
                int y = 4*block_idx_y[i4x4];
                h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
                                    h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
                                    h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
            }
        }
        else if( h->mb.b_transform_8x8 )
        {
            DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
            b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
            h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );

            for( idx = 0; idx < 4; idx++ )
            {
                if( h->mb.b_noise_reduction )
                    x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
                if( h->mb.b_trellis )
                    x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
                else
                    h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );

                h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );

                if( b_decimate )
                {
                    /* Per-8x8 decimation: score < 4 nulls the block. */
                    int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
                    i_decimate_mb += i_decimate_8x8;
                    if( i_decimate_8x8 < 4 )
                        nnz8x8[idx] = 0;
                }
                else
                    nnz8x8[idx] = array_non_zero( dct8x8[idx] );
            }

            /* Whole-MB decimation: total score < 6 nulls all four 8x8 blocks. */
            if( i_decimate_mb < 6 && b_decimate )
                *(uint32_t*)nnz8x8 = 0;
            else
            {
                for( idx = 0; idx < 4; idx++ )
                    if( nnz8x8[idx] )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
                    }
            }
        }
        else
        {
            DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
            h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );

            for( i8x8 = 0; i8x8 < 4; i8x8++ )
            {
                int i_decimate_8x8;

                /* encode one 4x4 block */
                i_decimate_8x8 = 0;
                for( i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    idx = i8x8 * 4 + i4x4;

                    if( h->mb.b_noise_reduction )
                        x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
                    if( h->mb.b_trellis )
                        x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
                    else
                        h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );

                    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );

                    if( b_decimate )
                        i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 );
                }

                /* decimate this 8x8 block */
                i_decimate_mb += i_decimate_8x8;
                if( i_decimate_8x8 < 4 && b_decimate )
                    nnz8x8[i8x8] = 0;
            }

            /* Whole-MB decimation: total score < 6 nulls all four 8x8 blocks. */
            if( i_decimate_mb < 6 && b_decimate )
                *(uint32_t*)nnz8x8 = 0;
            else
            {
                for( i8x8 = 0; i8x8 < 4; i8x8++ )
                    if( nnz8x8[i8x8] )
                    {
                        for( i = 0; i < 4; i++ )
                            h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
                        h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    }
            }
        }
    }

    /* encode chroma */
    if( IS_INTRA( h->mb.i_type ) )
    {
        const int i_mode = h->mb.i_chroma_pred_mode;
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
    }

    /* encode the 8x8 blocks */
    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );

    /* coded block pattern and non_zero_count */
    h->mb.i_cbp_luma = 0x00;
    if( h->mb.i_type == I_16x16 )
    {
        /* I_16x16: luma CBP is all-or-nothing (0x0 or 0xf). */
        for( i = 0; i < 16; i++ )
        {
            int nz = array_non_zero( h->dct.luma4x4[i] );
            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
            h->mb.i_cbp_luma |= nz;
        }
        h->mb.i_cbp_luma *= 0xf;
    }
    else
    {
        /* Other modes: one CBP bit per 8x8 block, driven by nnz8x8. */
        for( i = 0; i < 4; i++)
        {
            if(!nnz8x8[i])
                for( j = 0; j < 4; j++ )
                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = 0;
            else if( h->mb.b_transform_8x8 )
            {
                int nz = nnz8x8[i];
                for( j = 0; j < 4; j++ )
                    h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz;
                h->mb.i_cbp_luma |= nz << i;
            }
            else
            {
                for( j = 0; j < 4; j++ )
                {
                    int nz = array_non_zero( h->dct.luma4x4[j+i*4] );
                    h->mb.cache.non_zero_count[x264_scan8[j+i*4]] = nz;
                    h->mb.i_cbp_luma |= nz << i;
                }
            }
        }
    }

    /* CABAC additionally codes DC-only flags: bit0 luma DC, bits1-2 chroma DC. */
    if( h->param.b_cabac )
    {
        i_cbp_dc = ( h->mb.i_type == I_16x16 && array_non_zero( h->dct.luma16x16_dc ) )
                 | array_non_zero( h->dct.chroma_dc[0] ) << 1
                 | array_non_zero( h->dct.chroma_dc[1] ) << 2;
    }

    /* store cbp */
    h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;

    /* Check for P_SKIP
     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
     *      (if multiple mv give same result)*/
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma == 0x00 &&
            h->mb.cache.mv[0][x264_scan8[0]][0] == h->mb.cache.pskip_mv[0] &&
            h->mb.cache.mv[0][x264_scan8[0]][1] == h->mb.cache.pskip_mv[1] &&
            h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT &&
            h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}
608
/*****************************************************************************
 * x264_macroblock_probe_skip:
 *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
 *  the previous QP.  Returns 1 if skippable; on success the motion-compensated
 *  prediction is left in fdec and b_skip_pbskip_mc is set so the encode pass
 *  won't redo the MC.  Thresholds mirror the decimation rules (luma < 6,
 *  chroma AC < 7, chroma DC must be all-zero).
 *****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
{
    DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
    DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
    DECLARE_ALIGNED_16( int16_t dctscan[16] );

    int i_qp = h->mb.i_qp;
    int mvp[2];
    int ch;

    int i8x8, i4x4;
    int i_decimate_mb;

    /* For B_SKIP (b_bidir) the MC has already been done by direct prediction. */
    if( !b_bidir )
    {
        /* Get the MV */
        mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
        mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

        /* Motion compensation */
        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvp[0], mvp[1], 16, 16 );
    }

    for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
    {
        int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
        int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
        /* get luma diff */
        h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[0] + fenc_offset,
                                    h->mb.pic.p_fdec[0] + fdec_offset );
        /* encode one 4x4 block */
        for( i4x4 = 0; i4x4 < 4; i4x4++ )
        {
            h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
            if( !array_non_zero(dct4x4[i4x4]) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
            /* Too much luma residual to decimate: not skippable. */
            if( i_decimate_mb >= 6 )
                return 0;
        }
    }

    /* encode chroma */
    i_qp = h->mb.i_chroma_qp;

    for( ch = 0; ch < 2; ch++ )
    {
        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];

        if( !b_bidir )
        {
            h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch],       FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
                             mvp[0], mvp[1], 8, 8 );
        }

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );

        /* calculate dct DC */
        dct2x2[0][0] = dct4x4[0][0][0];
        dct2x2[0][1] = dct4x4[1][0][0];
        dct2x2[1][0] = dct4x4[2][0][0];
        dct2x2[1][1] = dct4x4[3][0][0];
        h->dctf.dct2x2dc( dct2x2 );
        h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 );
        /* Any nonzero chroma DC forbids the skip. */
        if( array_non_zero(dct2x2) )
            return 0;

        /* calculate dct coeffs */
        for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
        {
            h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
            if( !array_non_zero(dct4x4[i4x4]) )
                continue;
            /* AC-only score (skip coeff 0, the DC). */
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += x264_mb_decimate_score( dctscan+1, 15 );
            if( i_decimate_mb >= 7 )
                return 0;
        }
    }

    /* Prediction is already in fdec: tell the encode pass not to redo MC. */
    h->mb.b_skip_pbskip_mc = 1;
    return 1;
}
702
703 /****************************************************************************
704  * DCT-domain noise reduction / adaptive deadzone
705  * from libavcodec
706  ****************************************************************************/
707
708 void x264_noise_reduction_update( x264_t *h )
709 {
710     int cat, i;
711     for( cat = 0; cat < 2; cat++ )
712     {
713         int size = cat ? 64 : 16;
714         const uint16_t *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
715
716         if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
717         {
718             for( i = 0; i < size; i++ )
719                 h->nr_residual_sum[cat][i] >>= 1;
720             h->nr_count[cat] >>= 1;
721         }
722
723         for( i = 0; i < size; i++ )
724             h->nr_offset[cat][i] =
725                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
726                  + h->nr_residual_sum[cat][i]/2)
727               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
728     }
729 }
730
731 void x264_denoise_dct( x264_t *h, int16_t *dct )
732 {
733     const int cat = h->mb.b_transform_8x8;
734     int i;
735
736     h->nr_count[cat]++;
737
738     for( i = (cat ? 63 : 15); i >= 1; i-- )
739     {
740         int level = dct[i];
741         if( level )
742         {
743             if( level > 0 )
744             {
745                 h->nr_residual_sum[cat][i] += level;
746                 level -= h->nr_offset[cat][i];
747                 if( level < 0 )
748                     level = 0;
749             }
750             else
751             {
752                 h->nr_residual_sum[cat][i] -= level;
753                 level += h->nr_offset[cat][i];
754                 if( level > 0 )
755                     level = 0;
756             }
757             dct[i] = level;
758         }
759     }
760 }
761
/*****************************************************************************
 * RD only; 4 calls to this do not make up for one macroblock_encode.
 * doesn't transform chroma dc.
 *
 * Re-encode a single inter 8x8 partition (luma residual plus the co-located
 * 4x4 chroma block of each plane) and update i_cbp_luma/i_cbp_chroma.
 *****************************************************************************/
void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
    int i_qp = h->mb.i_qp;
    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
    int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
    int nnz8x8;
    int ch;

    /* Motion compensation for this partition only. */
    x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_transform_8x8 )
    {
        DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
        h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
        h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
        h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );

        /* Same decimation threshold as macroblock_encode: score < 4 -> null. */
        if( b_decimate )
            nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
        else
            nnz8x8 = array_non_zero( dct8x8 );

        if( nnz8x8 )
        {
            h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
            h->dctf.add8x8_idct8( p_fdec, dct8x8 );
        }
    }
    else
    {
        int i4;
        DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
        h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
        h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
        h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
        h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
        h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
        for( i4 = 0; i4 < 4; i4++ )
            h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );

        if( b_decimate )
        {
            /* Early exit once the score reaches 4 (block will be kept). */
            int i_decimate_8x8 = 0;
            for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
                i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 );
            nnz8x8 = 4 <= i_decimate_8x8;
        }
        else
            nnz8x8 = array_non_zero( dct4x4 );

        if( nnz8x8 )
        {
            for( i4 = 0; i4 < 4; i4++ )
                h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
            h->dctf.add8x8_idct( p_fdec, dct4x4 );
        }
    }

    i_qp = h->mb.i_chroma_qp;

    /* One 4x4 chroma block per plane; DC is not transformed here (see header). */
    for( ch = 0; ch < 2; ch++ )
    {
        DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
        p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
        p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;

        h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
        h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
        h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
        if( array_non_zero( dct4x4 ) )
        {
            h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
            h->dctf.add4x4_idct( p_fdec, dct4x4 );
        }
    }

    /* Update this partition's luma CBP bit; chroma CBP is forced to DC+AC. */
    h->mb.i_cbp_luma &= ~(1 << i8);
    h->mb.i_cbp_luma |= nnz8x8 << i8;
    h->mb.i_cbp_chroma = 0x02;
}
846 }