]> git.sesse.net Git - x264/blob - encoder/macroblock.c
limit vertical motion vectors to +/-512, since some decoders actually depend on that...
[x264] / encoder / macroblock.c
1 /*****************************************************************************
2  * macroblock.c: h264 encoder library
3  *****************************************************************************
4  * Copyright (C) 2003 Laurent Aimar
5  * $Id: macroblock.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
6  *
7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
22  *****************************************************************************/
23
24 #include <stdio.h>
25 #include <string.h>
26
27 #include "common/common.h"
28 #include "macroblock.h"
29
30
/* Copy the four coefficients of a 2x2 DC block into level[].
 * Entry i is read from dct[i&1][i>>1], i.e. the order is
 * dct[0][0], dct[1][0], dct[0][1], dct[1][1]. */
static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] )
{
    int i;
    for( i = 0; i < 4; i++ )
        level[i] = dct[i&1][i>>1];
}
40
/* (ref: JVT-B118)
 * x264_mb_decimate_score: rate a scanned coefficient block to decide whether
 * it is cheap enough to be dropped entirely (a low score means "zero it").
 *
 *  dct:   i_max scanned coefficients.
 *  i_max: number of coefficients; 64 selects the 8x8 cost table, any other
 *         value (15 and 16 are used in this file) selects the 4x4 table.
 *
 * Returns 9 as soon as a coefficient with magnitude above 1 is met.
 * Otherwise every non-zero coefficient adds a cost that depends on the number
 * of zeros immediately below it in scan order. Zeros above the last non-zero
 * coefficient are ignored.
 *
 * Thresholds applied by the callers (inter macroblocks, luma and chroma):
 *  luma:   one 8x8 block:   score < 4 -> null
 *          the complete mb: score < 6 -> null
 *  chroma: the complete mb: score < 7 -> null
 */
static int x264_mb_decimate_score( int *dct, int i_max )
{
    static const int run_cost_4x4[16] = {
        3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
    static const int run_cost_8x8[64] = {
        3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
        1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

    const int *run_cost = run_cost_4x4;
    int i_total = 0;
    int pos;

    if( i_max == 64 )
        run_cost = run_cost_8x8;

    /* locate the last non-zero coefficient */
    for( pos = i_max - 1; pos >= 0; pos-- )
        if( dct[pos] != 0 )
            break;

    /* walk down towards index 0, one non-zero coefficient per iteration */
    while( pos >= 0 )
    {
        int i_zeros = 0;

        if( abs( dct[pos] ) > 1 )
            return 9;
        pos--;

        /* count the zeros between this coefficient and the next one */
        for( ; pos >= 0 && dct[pos] == 0; pos-- )
            i_zeros++;

        i_total += run_cost[i_zeros];
    }

    return i_total;
}
84
85 void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale )
86 {
87     int x = 4 * block_idx_x[idx];
88     int y = 4 * block_idx_y[idx];
89     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
90     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
91     DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
92
93     if( h->mb.b_lossless )
94     {
95         h->zigzagf.sub_4x4( h->dct.block[idx].luma4x4, p_src, p_dst );
96         return;
97     }
98
99     h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );
100
101     if( h->mb.b_trellis )
102         x264_quant_4x4_trellis( h, dct4x4, CQM_4IY, i_qscale, DCT_LUMA_4x4, 1 );
103     else
104         h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
105
106     h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4 );
107     h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qscale );
108
109     /* output samples to fdec */
110     h->dctf.add4x4_idct( p_dst, dct4x4 );
111 }
112
113 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale )
114 {
115     int x = 8 * (idx&1);
116     int y = 8 * (idx>>1);
117     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
118     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
119     DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
120
121     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
122
123     if( h->mb.b_trellis )
124         x264_quant_8x8_trellis( h, dct8x8, CQM_8IY, i_qscale, 1 );
125     else 
126         h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8IY][i_qscale], h->quant8_bias[CQM_8IY][i_qscale] );
127
128     h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
129     h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qscale );
130     h->dctf.add8x8_idct8( p_dst, dct8x8 );
131 }
132
133 static void x264_mb_encode_i16x16( x264_t *h, int i_qscale )
134 {
135     uint8_t  *p_src = h->mb.pic.p_fenc[0];
136     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
137
138     DECLARE_ALIGNED( int16_t, dct4x4[16+1][4][4], 16 );
139
140     int i;
141
142     if( h->mb.b_lossless )
143     {
144         for( i = 0; i < 16; i++ )
145         {
146             int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
147             int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
148             h->zigzagf.sub_4x4ac( h->dct.block[i].residual_ac, p_src+oe, p_dst+od );
149             dct4x4[0][block_idx_x[i]][block_idx_y[i]] = p_src[oe] - p_dst[od];
150             p_dst[od] = p_src[oe];
151         }
152         h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
153         return;
154     }
155
156     h->dctf.sub16x16_dct( &dct4x4[1], p_src, p_dst );
157     for( i = 0; i < 16; i++ )
158     {
159         /* copy dc coeff */
160         dct4x4[0][block_idx_y[i]][block_idx_x[i]] = dct4x4[1+i][0][0];
161
162         /* quant/scan/dequant */
163         if( h->mb.b_trellis )
164             x264_quant_4x4_trellis( h, dct4x4[1+i], CQM_4IY, i_qscale, DCT_LUMA_AC, 1 );
165         else
166             h->quantf.quant_4x4( dct4x4[1+i], h->quant4_mf[CQM_4IY][i_qscale], h->quant4_bias[CQM_4IY][i_qscale] );
167
168         h->zigzagf.scan_4x4ac( h->dct.block[i].residual_ac, dct4x4[1+i] );
169         h->quantf.dequant_4x4( dct4x4[1+i], h->dequant4_mf[CQM_4IY], i_qscale );
170     }
171
172     h->dctf.dct4x4dc( dct4x4[0] );
173     h->quantf.quant_4x4_dc( dct4x4[0], h->quant4_mf[CQM_4IY][i_qscale][0]>>1, h->quant4_bias[CQM_4IY][i_qscale][0]<<1 );
174     h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct4x4[0] );
175
176     /* output samples to fdec */
177     h->dctf.idct4x4dc( dct4x4[0] );
178     x264_mb_dequant_4x4_dc( dct4x4[0], h->dequant4_mf[CQM_4IY], i_qscale );  /* XXX not inversed */
179
180     /* calculate dct coeffs */
181     for( i = 0; i < 16; i++ )
182     {
183         /* copy dc coeff */
184         dct4x4[1+i][0][0] = dct4x4[0][block_idx_y[i]][block_idx_x[i]];
185     }
186     /* put pixels to fdec */
187     h->dctf.add16x16_idct( p_dst, &dct4x4[1] );
188 }
189
190 void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale )
191 {
192     int i, ch;
193     int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
194
195     for( ch = 0; ch < 2; ch++ )
196     {
197         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
198         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
199         int i_decimate_score = 0;
200
201         DECLARE_ALIGNED( int16_t, dct2x2[2][2] , 16 );
202         DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
203
204         if( h->mb.b_lossless )
205         {
206             for( i = 0; i < 4; i++ )
207             {
208                 int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
209                 int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
210                 h->zigzagf.sub_4x4ac( h->dct.block[16+i+ch*4].residual_ac, p_src+oe, p_dst+od );
211                 h->dct.chroma_dc[ch][i] = p_src[oe] - p_dst[od];
212                 p_dst[od] = p_src[oe];
213             }
214             continue;
215         }
216             
217         h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
218         /* calculate dct coeffs */
219         for( i = 0; i < 4; i++ )
220         {
221             /* copy dc coeff */
222             dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0];
223
224             /* no trellis; it doesn't seem to help chroma noticeably */
225             h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qscale], h->quant4_bias[CQM_4IC+b_inter][i_qscale] );
226             h->zigzagf.scan_4x4ac( h->dct.block[16+i+ch*4].residual_ac, dct4x4[i] );
227
228             if( b_decimate )
229             {
230                 i_decimate_score += x264_mb_decimate_score( h->dct.block[16+i+ch*4].residual_ac, 15 );
231             }
232         }
233
234         h->dctf.dct2x2dc( dct2x2 );
235         h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qscale][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qscale][0]<<1 );
236         zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
237
238         /* output samples to fdec */
239         h->dctf.idct2x2dc( dct2x2 );
240         x264_mb_dequant_2x2_dc( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qscale );  /* XXX not inversed */
241
242         if( b_decimate && i_decimate_score < 7 )
243         {
244             /* Near null chroma 8x8 block so make it null (bits saving) */
245             memset( &h->dct.block[16+ch*4], 0, 4 * sizeof( *h->dct.block ) );
246             if( !array_non_zero( dct2x2 ) )
247                 continue;
248             memset( dct4x4, 0, sizeof( dct4x4 ) );
249         }
250         else
251         {
252             for( i = 0; i < 4; i++ )
253                 h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale );
254         }
255
256         for( i = 0; i < 4; i++ )
257             dct4x4[i][0][0] = dct2x2[0][i];
258         h->dctf.add8x8_idct( p_dst, dct4x4 );
259     }
260
261     /* coded block pattern */
262     h->mb.i_cbp_chroma = 0;
263     for( i = 0; i < 8; i++ )
264     {
265         int nz = array_non_zero_count( h->dct.block[16+i].residual_ac, 15 );
266         h->mb.cache.non_zero_count[x264_scan8[16+i]] = nz;
267         h->mb.i_cbp_chroma |= nz;
268     }
269     if( h->mb.i_cbp_chroma )
270         h->mb.i_cbp_chroma = 2;    /* dc+ac (we can't do only ac) */
271     else if( array_non_zero( h->dct.chroma_dc ) )
272         h->mb.i_cbp_chroma = 1;    /* dc only */
273 }
274
275 static void x264_macroblock_encode_skip( x264_t *h )
276 {
277     int i;
278     h->mb.i_cbp_luma = 0x00;
279     h->mb.i_cbp_chroma = 0x00;
280
281     for( i = 0; i < 16+8; i++ )
282     {
283         h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
284     }
285
286     /* store cbp */
287     h->mb.cbp[h->mb.i_mb_xy] = 0;
288 }
289
290 /*****************************************************************************
291  * x264_macroblock_encode_pskip:
292  *  Encode an already marked skip block
293  *****************************************************************************/
294 void x264_macroblock_encode_pskip( x264_t *h )
295 {
296     const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
297                                 h->mb.mv_min[0], h->mb.mv_max[0] );
298     const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
299                                 h->mb.mv_min[1], h->mb.mv_max[1] );
300
301     /* Motion compensation XXX probably unneeded */
302     h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
303                    h->mb.pic.p_fdec[0],    FDEC_STRIDE,
304                    mvx, mvy, 16, 16 );
305
306     /* Chroma MC */
307     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
308                      h->mb.pic.p_fdec[1],       FDEC_STRIDE,
309                      mvx, mvy, 8, 8 );
310
311     h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
312                      h->mb.pic.p_fdec[2],       FDEC_STRIDE,
313                      mvx, mvy, 8, 8 );
314
315     x264_macroblock_encode_skip( h );
316 }
317
318 /*****************************************************************************
319  * x264_macroblock_encode:
320  *****************************************************************************/
321 void x264_macroblock_encode( x264_t *h )
322 {
323     int i_cbp_dc = 0;
324     int i_qp = h->mb.i_qp;
325     int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
326     int b_force_no_skip = 0;
327     int i;
328
329     if( h->sh.b_mbaff
330         && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
331         && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
332     {
333         /* The first skip is predicted to be a frame mb pair.
334          * We don't yet support the aff part of mbaff, so force it to non-skip
335          * so that we can pick the aff flag. */
336         b_force_no_skip = 1;
337         if( IS_SKIP(h->mb.i_type) )
338         {
339             if( h->mb.i_type == P_SKIP )
340                 h->mb.i_type = P_L0;
341             else if( h->mb.i_type == B_SKIP )
342                 h->mb.i_type = B_DIRECT;
343         }
344     }
345
346     if( h->mb.i_type == P_SKIP )
347     {
348         /* A bit special */
349         x264_macroblock_encode_pskip( h );
350         return;
351     }
352     if( h->mb.i_type == B_SKIP )
353     {
354         /* XXX motion compensation is probably unneeded */
355         x264_mb_mc( h );
356         x264_macroblock_encode_skip( h );
357         return;
358     }
359
360     if( h->mb.i_type == I_16x16 )
361     {
362         const int i_mode = h->mb.i_intra16x16_pred_mode;
363         h->mb.b_transform_8x8 = 0;
364         /* do the right prediction */
365         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
366
367         /* encode the 16x16 macroblock */
368         x264_mb_encode_i16x16( h, i_qp );
369     }
370     else if( h->mb.i_type == I_8x8 )
371     {
372         DECLARE_ALIGNED( uint8_t, edge[33], 8 );
373         h->mb.b_transform_8x8 = 1;
374         for( i = 0; i < 4; i++ )
375         {
376             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
377             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
378
379             x264_predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
380             h->predict_8x8[i_mode]( p_dst, edge );
381             x264_mb_encode_i8x8( h, i, i_qp );
382         }
383     }
384     else if( h->mb.i_type == I_4x4 )
385     {
386         h->mb.b_transform_8x8 = 0;
387         for( i = 0; i < 16; i++ )
388         {
389             uint8_t  *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE];
390             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
391
392             if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
393                 /* emulate missing topright samples */
394                 *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
395
396             h->predict_4x4[i_mode]( p_dst );
397             x264_mb_encode_i4x4( h, i, i_qp );
398         }
399     }
400     else    /* Inter MB */
401     {
402         int i8x8, i4x4, idx;
403         int i_decimate_mb = 0;
404
405         /* Motion compensation */
406         x264_mb_mc( h );
407
408         if( h->mb.b_lossless )
409         {
410             for( i4x4 = 0; i4x4 < 16; i4x4++ )
411             {
412                 int x = 4*block_idx_x[i4x4];
413                 int y = 4*block_idx_y[i4x4];
414                 h->zigzagf.sub_4x4( h->dct.block[i4x4].luma4x4,
415                                     h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
416                                     h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
417             }
418         }
419         else if( h->mb.b_transform_8x8 )
420         {
421             DECLARE_ALIGNED( int16_t, dct8x8[4][8][8], 16 );
422             int nnz8x8[4] = {1,1,1,1};
423             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
424             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
425
426             for( idx = 0; idx < 4; idx++ )
427             {
428                 if( h->mb.b_noise_reduction )
429                     x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
430                 if( h->mb.b_trellis )
431                     x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
432                 else
433                     h->quantf.quant_8x8( dct8x8[idx], h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
434
435                 h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
436
437                 if( b_decimate )
438                 {
439                     int i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
440                     i_decimate_mb += i_decimate_8x8;
441                     if( i_decimate_8x8 < 4 )
442                     {
443                         memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) );
444                         memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) );
445                         nnz8x8[idx] = 0;
446                     }
447                 }
448                 else
449                     nnz8x8[idx] = array_non_zero( dct8x8[idx] );
450             }
451
452             if( i_decimate_mb < 6 && b_decimate )
453                 memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) );
454             else
455             {
456                 for( idx = 0; idx < 4; idx++ )
457                     if( nnz8x8[idx] )
458                     {
459                         h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
460                         h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
461                     }
462             }
463         }
464         else
465         {
466             DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
467             int nnz8x8[4] = {1,1,1,1};
468             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
469
470             for( i8x8 = 0; i8x8 < 4; i8x8++ )
471             {
472                 int i_decimate_8x8;
473
474                 /* encode one 4x4 block */
475                 i_decimate_8x8 = 0;
476                 for( i4x4 = 0; i4x4 < 4; i4x4++ )
477                 {
478                     idx = i8x8 * 4 + i4x4;
479
480                     if( h->mb.b_noise_reduction )
481                         x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
482                     if( h->mb.b_trellis )
483                         x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
484                     else
485                         h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
486
487                     h->zigzagf.scan_4x4( h->dct.block[idx].luma4x4, dct4x4[idx] );
488                     
489                     if( b_decimate )
490                         i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
491                 }
492
493                 /* decimate this 8x8 block */
494                 i_decimate_mb += i_decimate_8x8;
495                 if( i_decimate_8x8 < 4 && b_decimate )
496                 {
497                     memset( &dct4x4[i8x8*4], 0, 4 * sizeof( *dct4x4 ) );
498                     memset( &h->dct.block[i8x8*4], 0, 4 * sizeof( *h->dct.block ) );
499                     nnz8x8[i8x8] = 0;
500                 }
501             }
502
503             if( i_decimate_mb < 6 && b_decimate )
504                 memset( h->dct.block, 0, 16 * sizeof( *h->dct.block ) );
505             else
506             {
507                 for( i8x8 = 0; i8x8 < 4; i8x8++ )
508                     if( nnz8x8[i8x8] )
509                     {
510                         for( i = 0; i < 4; i++ )
511                             h->quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp );
512                         h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
513                     }
514             }
515         }
516     }
517
518     /* encode chroma */
519     if( IS_INTRA( h->mb.i_type ) )
520     {
521         const int i_mode = h->mb.i_chroma_pred_mode;
522         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
523         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
524     }
525
526     /* encode the 8x8 blocks */
527     x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
528
529     /* coded block pattern and non_zero_count */
530     h->mb.i_cbp_luma = 0x00;
531     if( h->mb.i_type == I_16x16 )
532     {
533         for( i = 0; i < 16; i++ )
534         {
535             const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 );
536             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
537             if( nz > 0 )
538                 h->mb.i_cbp_luma = 0x0f;
539         }
540     }
541     else if( h->mb.b_transform_8x8 )
542     {
543         /* coded_block_flag is enough for CABAC.
544          * the full non_zero_count is done only in CAVLC. */
545         for( i = 0; i < 4; i++ )
546         {
547             const int nz = array_non_zero( h->dct.luma8x8[i] );
548             int j;
549             for( j = 0; j < 4; j++ )
550                 h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz;
551             if( nz > 0 )
552                 h->mb.i_cbp_luma |= 1 << i;
553         }
554     }
555     else
556     {
557         for( i = 0; i < 16; i++ )
558         {
559             const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 );
560             h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
561             if( nz > 0 )
562                 h->mb.i_cbp_luma |= 1 << (i/4);
563         }
564     }
565
566     if( h->param.b_cabac )
567     {
568         i_cbp_dc = ( h->mb.i_type == I_16x16 && array_non_zero( h->dct.luma16x16_dc ) )
569                  | array_non_zero( h->dct.chroma_dc[0] ) << 1
570                  | array_non_zero( h->dct.chroma_dc[1] ) << 2;
571     }
572
573     /* store cbp */
574     h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;
575
576     /* Check for P_SKIP
577      * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
578      *      (if multiple mv give same result)*/
579     if( !b_force_no_skip )
580     {
581         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
582             h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma == 0x00 &&
583             h->mb.cache.mv[0][x264_scan8[0]][0] == h->mb.cache.pskip_mv[0] &&
584             h->mb.cache.mv[0][x264_scan8[0]][1] == h->mb.cache.pskip_mv[1] &&
585             h->mb.cache.ref[0][x264_scan8[0]] == 0 )
586         {
587             h->mb.i_type = P_SKIP;
588         }
589
590         /* Check for B_SKIP */
591         if( h->mb.i_type == B_DIRECT &&
592             h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
593         {
594             h->mb.i_type = B_SKIP;
595         }
596     }
597 }
598
599 /*****************************************************************************
600  * x264_macroblock_probe_skip:
601  *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
602  *  the previous QP
603  *****************************************************************************/
604 int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
605 {
606     DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
607     DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
608     DECLARE_ALIGNED( int,     dctscan[16], 16 );
609
610     int i_qp = h->mb.i_qp;
611     int mvp[2];
612     int ch;
613
614     int i8x8, i4x4;
615     int i_decimate_mb;
616
617     if( !b_bidir )
618     {
619         /* Get the MV */
620         mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
621         mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );
622
623         /* Motion compensation */
624         h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
625                        h->mb.pic.p_fdec[0],    FDEC_STRIDE,
626                        mvp[0], mvp[1], 16, 16 );
627     }
628
629     /* get luma diff */
630     h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0],
631                                   h->mb.pic.p_fdec[0] );
632
633     for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
634     {
635         /* encode one 4x4 block */
636         for( i4x4 = 0; i4x4 < 4; i4x4++ )
637         {
638             const int idx = i8x8 * 4 + i4x4;
639
640             h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
641             h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
642
643             i_decimate_mb += x264_mb_decimate_score( dctscan, 16 );
644
645             if( i_decimate_mb >= 6 )
646             {
647                 /* not as P_SKIP */
648                 return 0;
649             }
650         }
651     }
652
653     /* encode chroma */
654     i_qp = h->mb.i_chroma_qp;
655
656     for( ch = 0; ch < 2; ch++ )
657     {
658         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
659         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
660
661         if( !b_bidir )
662         {
663             h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
664                              h->mb.pic.p_fdec[1+ch],       FDEC_STRIDE,
665                              mvp[0], mvp[1], 8, 8 );
666         }
667
668         h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
669
670         /* calculate dct DC */
671         dct2x2[0][0] = dct4x4[0][0][0];
672         dct2x2[0][1] = dct4x4[1][0][0];
673         dct2x2[1][0] = dct4x4[2][0][0];
674         dct2x2[1][1] = dct4x4[3][0][0];
675         h->dctf.dct2x2dc( dct2x2 );
676         h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 );
677         if( dct2x2[0][0] || dct2x2[0][1] || dct2x2[1][0] || dct2x2[1][1]  )
678         {
679             /* can't be */
680             return 0;
681         }
682
683         /* calculate dct coeffs */
684         for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
685         {
686             h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
687             h->zigzagf.scan_4x4ac( dctscan, dct4x4[i4x4] );
688
689             i_decimate_mb += x264_mb_decimate_score( dctscan, 15 );
690             if( i_decimate_mb >= 7 )
691             {
692                 return 0;
693             }
694         }
695     }
696
697     return 1;
698 }
699
700 /****************************************************************************
701  * DCT-domain noise reduction / adaptive deadzone
702  * from libavcodec
703  ****************************************************************************/
704
705 void x264_noise_reduction_update( x264_t *h )
706 {
707     int cat, i;
708     for( cat = 0; cat < 2; cat++ )
709     {
710         int size = cat ? 64 : 16;
711         const int *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
712
713         if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
714         {
715             for( i = 0; i < size; i++ )
716                 h->nr_residual_sum[cat][i] >>= 1;
717             h->nr_count[cat] >>= 1;
718         }
719
720         for( i = 0; i < size; i++ )
721             h->nr_offset[cat][i] =
722                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
723                  + h->nr_residual_sum[cat][i]/2)
724               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
725     }
726 }
727
728 void x264_denoise_dct( x264_t *h, int16_t *dct )
729 {
730     const int cat = h->mb.b_transform_8x8;
731     int i;
732
733     h->nr_count[cat]++;
734
735     for( i = (cat ? 63 : 15); i >= 1; i-- )
736     {
737         int level = dct[i];
738         if( level )
739         {
740             if( level > 0 )
741             {
742                 h->nr_residual_sum[cat][i] += level;
743                 level -= h->nr_offset[cat][i];
744                 if( level < 0 )
745                     level = 0;
746             }
747             else
748             {
749                 h->nr_residual_sum[cat][i] -= level;
750                 level += h->nr_offset[cat][i];
751                 if( level > 0 )
752                     level = 0;
753             }
754             dct[i] = level;
755         }
756     }
757 }
758
759 /*****************************************************************************
760  * RD only; 4 calls to this do not make up for one macroblock_encode.
761  * doesn't transform chroma dc.
762  *****************************************************************************/
763 void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
764 {
765     int i_qp = h->mb.i_qp;
766     uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
767     uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
768     int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
769     int nnz8x8;
770     int ch;
771
772     x264_mb_mc_8x8( h, i8 );
773
774     if( h->mb.b_transform_8x8 )
775     {
776         DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
777         h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
778         h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] );
779         h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );
780
781         if( b_decimate )
782             nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
783         else
784             nnz8x8 = array_non_zero( dct8x8 );
785
786         if( nnz8x8 )
787         {
788             h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
789             h->dctf.add8x8_idct8( p_fdec, dct8x8 );
790         }
791     }
792     else
793     {
794         int i4;
795         DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
796         h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
797         h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
798         h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
799         h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
800         h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] );
801         for( i4 = 0; i4 < 4; i4++ )
802             h->zigzagf.scan_4x4( h->dct.block[i8*4+i4].luma4x4, dct4x4[i4] );
803
804         if( b_decimate )
805         {
806             int i_decimate_8x8 = 0;
807             for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ )
808                 i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[i8*4+i4].luma4x4, 16 );
809             nnz8x8 = 4 <= i_decimate_8x8;
810         }
811         else
812             nnz8x8 = array_non_zero( dct4x4 );
813
814         if( nnz8x8 )
815         {
816             for( i4 = 0; i4 < 4; i4++ )
817                 h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
818             h->dctf.add8x8_idct( p_fdec, dct4x4 );
819         }
820     }
821
822     i_qp = h->mb.i_chroma_qp;
823
824     for( ch = 0; ch < 2; ch++ )
825     {
826         DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
827         p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
828         p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
829
830         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
831         h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
832         h->zigzagf.scan_4x4ac( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 );
833         if( array_non_zero( dct4x4 ) )
834         {
835             h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
836             h->dctf.add4x4_idct( p_fdec, dct4x4 );
837         }
838     }
839
840     if( nnz8x8 )
841         h->mb.i_cbp_luma |= (1 << i8);
842     else
843         h->mb.i_cbp_luma &= ~(1 << i8);
844     h->mb.i_cbp_chroma = 0x02;
845 }