/*****************************************************************************
 * macroblock.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"

/* These chroma DC functions don't have assembly versions and are only used here. */

#define ZIG(i,y,x) level[i] = dct[x*2+y];
static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
{
    ZIG(0,0,0)
    ZIG(1,0,1)
    ZIG(2,1,0)
    ZIG(3,1,1)
}
#undef ZIG
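
/* A note on the 2x2 chroma DC math below: H.264's chroma DC transform is a
 * 2x2 Hadamard (sum/difference butterflies on rows, then columns), so the
 * inverse reuses the same adds and subtracts.  dmf is the DC entry of the
 * dequant table scaled up by 2^(i_qp/6), and the final >>5 is the spec's
 * chroma DC dequant normalization; roughly: out = Hadamard(dct) * dmf >> 5. */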

#define IDCT_DEQUANT_START \
    int d0 = dct[0] + dct[1]; \
    int d1 = dct[2] + dct[3]; \
    int d2 = dct[0] - dct[1]; \
    int d3 = dct[2] - dct[3]; \
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;

static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_START
    dct4x4[0][0] = (d0 + d1) * dmf >> 5;
    dct4x4[1][0] = (d0 - d1) * dmf >> 5;
    dct4x4[2][0] = (d2 + d3) * dmf >> 5;
    dct4x4[3][0] = (d2 - d3) * dmf >> 5;
}

static inline void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_START
    out[0] = (d0 + d1) * dmf >> 5;
    out[1] = (d0 - d1) * dmf >> 5;
    out[2] = (d2 + d3) * dmf >> 5;
    out[3] = (d2 - d3) * dmf >> 5;
}

static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
{
    int d0 = dct4x4[0][0] + dct4x4[1][0];
    int d1 = dct4x4[2][0] + dct4x4[3][0];
    int d2 = dct4x4[0][0] - dct4x4[1][0];
    int d3 = dct4x4[2][0] - dct4x4[3][0];
    d[0] = d0 + d1;
    d[2] = d2 + d3;
    d[1] = d0 - d1;
    d[3] = d2 - d3;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
}
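
/* dct2x2dc gathers the DC term of each of the four chroma 4x4 blocks, applies
 * the forward 2x2 Hadamard in place, and zeroes the per-block DC slots: the
 * DCs are coded in their own 2x2 block, so the 4x4 residuals carry AC only. */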

static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
    if( h->mb.b_trellis )
        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, 0, idx );
    else
        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
}

static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
    if( h->mb.b_trellis )
        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
    else
        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
}

/* All encoding functions must output the correct CBP and NNZ values.
 * The entropy coding functions will check CBP first, then NNZ, before
 * actually reading the DCT coefficients.  NNZ still must be correct even
 * if CBP is zero because of the use of NNZ values for context selection.
 * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 * coding and stored as necessary. */

/* This means that decimation can be done merely by adjusting the CBP and NNZ
 * rather than memsetting the coefficients. */
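
/* (A decimate score, as used throughout this file, is a small cost summed over
 * a block's nonzero coefficients: roughly, a level of magnitude >1 scores high
 * enough to keep the block, while a +-1 level scores by the length of the
 * zero-run before it.  A block whose total lands under a threshold is dropped
 * wholesale by clearing its CBP/NNZ as described above.) */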

void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
    pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
        h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
        h->mb.i_cbp_luma |= nz<<(idx>>2);
        return;
    }

    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );

    nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<(idx>>2);
        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
        h->dctf.add4x4_idct( p_dst, dct4x4 );
    }
}
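
/* Note the CBP updates above: luma CBP holds one bit per 8x8 quadrant, so a
 * 4x4 block with index idx maps to bit (idx>>2); the 8x8 functions below use
 * bit idx directly. */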

#define STORE_8x8_NNZ( s8, nz )\
do\
{\
    M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
    M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
} while(0)

#define CLEAR_16x16_NNZ \
{\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
}
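
/* Both macros rely on the scan8 cache layout: the four NNZ entries of an 8x8
 * block sit as two adjacent bytes on each of two cache rows (row stride 8),
 * so M16/M32 multi-byte stores can write them at once, with (nz)*0x0101
 * broadcasting a 0/1 value to both bytes of a pair. */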

void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
{
    int x = idx&1;
    int y = idx>>1;
    int s8 = X264_SCAN8_0 + 2*x + 16*y;
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
    pixel *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
    ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
        STORE_8x8_NNZ( s8, nz );
        h->mb.i_cbp_luma |= nz<<idx;
        return;
    }

    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );

    nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<idx;
        h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
        h->dctf.add8x8_idct8( p_dst, dct8x8 );
        STORE_8x8_NNZ( s8, 1 );
    }
    else
        STORE_8x8_NNZ( s8, 0 );
}

static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
    pixel *p_src = h->mb.pic.p_fenc[0];
    pixel *p_dst = h->mb.pic.p_fdec[0];

    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
    ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] );

    int nz;
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;

    if( h->mb.b_lossless )
    {
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
            h->mb.i_cbp_luma |= nz;
        }
        h->mb.i_cbp_luma *= 0xf;
        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
        return;
    }

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    for( int i = 0; i < 16; i++ )
    {
        /* copy dc coeff */
        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
        dct4x4[i][0] = 0;

        /* quant/scan/dequant */
        nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
        h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[i] );
            h->mb.i_cbp_luma = 0xf;
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
    if( decimate_score < 6 )
    {
        h->mb.i_cbp_luma = 0;
        CLEAR_16x16_NNZ
    }

    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_dc_trellis( h, dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[24]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
        if( h->mb.i_cbp_luma )
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( h->mb.i_cbp_luma )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
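
/* i16x16 flow, for reference: each 4x4's DC is pulled out and zeroed before AC
 * quantization, the 16 DCs are transformed together with an extra 4x4 Hadamard
 * (dct4x4dc) and quantized as their own block, and on reconstruction the
 * dequantized DCs are written back into slot [0] of each 4x4 before the full
 * IDCT -- or fed to the DC-only IDCT if all AC blocks were empty or decimated. */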

static inline int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
    dctcoef out[4];
    idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
    return ((ref[0] ^ (out[0]+32))
          | (ref[1] ^ (out[1]+32))
          | (ref[2] ^ (out[2]+32))
          | (ref[3] ^ (out[3]+32))) >> 6;
}
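
/* The XOR trick above: the caller's ref[] holds the original reconstruction
 * with the rounding bias 32 already added, so the result is zero iff every
 * (out[i]+32) agrees with ref[i] in all bits above bit 5 -- i.e. iff the
 * candidate coefficients still reconstruct to the same values after the final
 * >>6 rounding. */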

/* Round down coefficients losslessly in DC-only chroma blocks.
 * Unlike luma blocks, this can't be done with a lookup table or
 * other shortcut technique because of the interdependencies
 * between the coefficients due to the chroma DC transform. */
static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, dctcoef dct2x2[4] )
{
    dctcoef dct2x2_orig[4];
    int coeff, nz;

    /* If the QP is too high, there's no benefit to rounding optimization. */
    if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
        return 1;

    idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
    dct2x2_orig[0] += 32;
    dct2x2_orig[1] += 32;
    dct2x2_orig[2] += 32;
    dct2x2_orig[3] += 32;

    /* If the DC coefficients already round to zero, terminate early. */
    if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
        return 0;

    /* Start with the highest frequency coefficient... is this the best option? */
    for( nz = 0, coeff = h->quantf.coeff_last[DCT_CHROMA_DC]( dct2x2 ); coeff >= 0; coeff-- )
    {
        int level = dct2x2[coeff];
        int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */

        while( level )
        {
            dct2x2[coeff] = level - sign;
            if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
            {
                nz = 1;
                dct2x2[coeff] = level;
                break;
            }
            level -= sign;
        }
    }

    return nz;
}
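
/* The loop above is a greedy search: starting from the last (highest
 * frequency) coefficient, shrink each level toward zero for as long as the
 * rounded reconstruction is unchanged, restoring the previous level as soon
 * as a step would alter the output.  Returning 0 means every coefficient
 * reduced to zero, so the caller can drop the block entirely. */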

void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
    h->mb.i_cbp_chroma = 0;

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
    {
        int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd[2];
        int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
        if( score < thresh*4 )
            score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
        if( score < thresh*4 )
        {
            h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[25]] ) = 0;

            for( int ch = 0; ch < 2; ch++ )
            {
                if( ssd[ch] > thresh )
                {
                    h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
                    else
                        nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

                    if( nz_dc )
                    {
                        if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
                        zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                        h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }
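
    /* (On the early-out above: var2_8x8 returns the variance of the fenc/fdec
     * difference and writes its SSD through the last argument; if the combined
     * variance of both chroma planes stays under the lambda-derived threshold,
     * all chroma NNZ is cleared wholesale and only a cheap DC-only transform
     * is retried per plane whose SSD still exceeds the threshold.) */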

    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = 0;
        int nz_ac = 0;

        ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );

        if( h->mb.b_lossless )
        {
            for( int i = 0; i < 4; i++ )
            {
                int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
                int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
            continue;
        }

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
        dct2x2dc( dct2x2, dct4x4 );
        /* calculate dct coeffs */
        for( int i = 0; i < 4; i++ )
        {
            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
            if( nz )
            {
                nz_ac = 1;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                if( b_decimate )
                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
        else
            nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc;

        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
        {
            /* Decimate the block */
            h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
                continue;
            }
            /* DC-only */
            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;
            if( nz_dc )
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            }
            h->dctf.add8x8_idct( p_dst, dct4x4 );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
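    /* i_cbp_chroma is still 0/1 (AC present) here and M16 over scan8[25] reads
     * both planes' DC NNZ flags at once, so (!!dc | ac) + ac yields 2 when any
     * AC survived, 1 for DC-only, and 0 for none. */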
    h->mb.i_cbp_chroma = ((!!M16( &h->mb.cache.non_zero_count[x264_scan8[25]] )) | h->mb.i_cbp_chroma) + h->mb.i_cbp_chroma;
}

static void x264_macroblock_encode_skip( x264_t *h )
{
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] ) = 0;
    for( int i = 16; i < 24; i++ )
        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
    h->mb.i_cbp_luma = 0;
    h->mb.i_cbp_chroma = 0;
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}

/*****************************************************************************
 * x264_macroblock_encode_pskip:
 *  Encode an already marked skip block
 *****************************************************************************/
static void x264_macroblock_encode_pskip( x264_t *h )
{
    /* don't do pskip motion compensation if it was already done in macroblock_analyse */
    if( !h->mb.b_skip_mc )
    {
        int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                              h->mb.mv_min[0], h->mb.mv_max[0] );
        int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                              h->mb.mv_min[1], h->mb.mv_max[1] );

        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvx, mvy, 16, 16, &h->sh.weight[0][0] );

        /* Special case for mv0, which is (of course) very common in P-skip mode. */
        if( mvx | mvy )
        {
            h->mc.mc_chroma( h->mb.pic.p_fdec[1],       FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                             mvx, mvy, 8, 8 );
            h->mc.mc_chroma( h->mb.pic.p_fdec[2],       FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
                             mvx, mvy, 8, 8 );
        }
        else
        {
            h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], 8 );
            h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], 8 );
        }

        if( h->sh.weight[0][1].weightfn )
            h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                               h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                               &h->sh.weight[0][1], 8 );

        if( h->sh.weight[0][2].weightfn )
            h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                               h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                               &h->sh.weight[0][2], 8 );
    }

    x264_macroblock_encode_skip( h );
}

/*****************************************************************************
 * Intra prediction for predictive lossless mode.
 *****************************************************************************/

/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes
 * that the edge pixels of the reconstructed frame are the same as those of the source frame.  This means
 * they will only work correctly if the neighboring blocks are losslessly coded.  In practice, this means
 * lossless mode cannot be mixed with lossy mode within a frame. */
/* This can be resolved by explicitly copying the edge pixels after doing the mc.copy, but this doesn't
 * need to be done unless we decide to allow mixing lossless and lossy compression. */

void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
{
    int stride = h->fenc->i_stride[1] << h->mb.b_interlaced;
    if( i_mode == I_PRED_CHROMA_V )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 );
    }
    else if( i_mode == I_PRED_CHROMA_H )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 );
    }
    else
    {
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
    }
}

void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    pixel *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;

    if( i_mode == I_PRED_4x4_V )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
    else if( i_mode == I_PRED_4x4_H )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
    else
        h->predict_4x4[i_mode]( p_dst );
}

void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pixel edge[33] )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    pixel *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;

    if( i_mode == I_PRED_8x8_V )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
    else if( i_mode == I_PRED_8x8_H )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
    else
        h->predict_8x8[i_mode]( p_dst, edge );
}

void x264_predict_lossless_16x16( x264_t *h, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    if( i_mode == I_PRED_16x16_V )
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
    else if( i_mode == I_PRED_16x16_H )
        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-1, stride, 16 );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
}

/*****************************************************************************
 * x264_macroblock_encode:
 *****************************************************************************/
void x264_macroblock_encode( x264_t *h )
{
    int i_qp = h->mb.i_qp;
    int b_decimate = h->mb.b_dct_decimate;
    int b_force_no_skip = 0;
    int nz;
    h->mb.i_cbp_luma = 0;
    h->mb.cache.non_zero_count[x264_scan8[24]] = 0;

    if( h->mb.i_type == I_PCM )
    {
        /* if PCM is chosen, we need to store reconstructed frame data */
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
        return;
    }

    if( h->sh.b_mbaff
        && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
        && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
    {
        /* The first skip is predicted to be a frame mb pair.
         * We don't yet support the aff part of mbaff, so force it to non-skip
         * so that we can pick the aff flag. */
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    if( h->mb.i_type == P_SKIP )
    {
        /* A bit special */
        x264_macroblock_encode_pskip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        const int i_mode = h->mb.i_intra16x16_pred_mode;
        h->mb.b_transform_8x8 = 0;

        if( h->mb.b_lossless )
            x264_predict_lossless_16x16( h, i_mode );
        else
            h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );

        /* encode the 16x16 macroblock */
        x264_mb_encode_i16x16( h, i_qp );
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( pixel, edge,[33] );
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( int i = h->mb.i_skip_intra ? 3 : 0 ; i < 4; i++ )
        {
            pixel *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
            h->predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );

            if( h->mb.b_lossless )
                x264_predict_lossless_8x8( h, p_dst, i, i_mode, edge );
            else
                h->predict_8x8[i_mode]( p_dst, edge );

            x264_mb_encode_i8x8( h, i, i_qp );
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( int i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
        {
            pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

            if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );

            if( h->mb.b_lossless )
                x264_predict_lossless_4x4( h, p_dst, i, i_mode );
            else
                h->predict_4x4[i_mode]( p_dst );
            x264_mb_encode_i4x4( h, i, i_qp );
        }
    }
    else    /* Inter MB */
    {
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless )
        {
            if( h->mb.b_transform_8x8 )
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    int x = i8x8&1;
                    int y = i8x8>>1;
                    int s8 = X264_SCAN8_0 + 2*x + 16*y;

                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
                                                                   h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
                    STORE_8x8_NNZ( s8, nz );
                    h->mb.i_cbp_luma |= nz << i8x8;
                }
            else
                for( int i4x4 = 0; i4x4 < 16; i4x4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
                                        h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
                                        h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
                    h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
                    h->mb.i_cbp_luma |= nz << (i4x4>>2);
                }
        }
        else if( h->mb.b_transform_8x8 )
        {
            ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
            b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
            h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[1] += h->mb.b_noise_reduction * 4;

            for( int idx = 0; idx < 4; idx++ )
            {
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );

                if( nz )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
                    if( b_decimate )
                    {
                        int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
                        i_decimate_mb += i_decimate_8x8;
                        if( i_decimate_8x8 >= 4 )
                            h->mb.i_cbp_luma |= 1<<idx;
                    }
                    else
                        h->mb.i_cbp_luma |= 1<<idx;
                }
            }

            if( i_decimate_mb < 6 && b_decimate )
            {
                h->mb.i_cbp_luma = 0;
                CLEAR_16x16_NNZ
            }
            else
            {
                for( int idx = 0; idx < 4; idx++ )
                {
                    int x = idx&1;
                    int y = idx>>1;
                    int s8 = X264_SCAN8_0 + 2*x + 16*y;

                    if( h->mb.i_cbp_luma&(1<<idx) )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
                        STORE_8x8_NNZ( s8, 1 );
                    }
                    else
                        STORE_8x8_NNZ( s8, 0 );
                }
            }
        }
        else
        {
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
            h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[0] += h->mb.b_noise_reduction * 16;

            for( int i8x8 = 0; i8x8 < 4; i8x8++ )
            {
                int i_decimate_8x8 = 0;
                int cbp = 0;

                /* encode one 4x4 block */
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    int idx = i8x8 * 4 + i4x4;

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                    nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
                    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;

                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
                        if( b_decimate && i_decimate_8x8 < 6 )
                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
                        cbp = 1;
                    }
                }

                int x = i8x8&1;
                int y = i8x8>>1;

                /* decimate this 8x8 block */
                i_decimate_mb += i_decimate_8x8;
                if( b_decimate )
                {
                    if( i_decimate_8x8 < 4 )
                    {
                        int s8 = X264_SCAN8_0 + 2*x + 16*y;
                        STORE_8x8_NNZ( s8, 0 );
                    }
                    else
                        h->mb.i_cbp_luma |= 1<<i8x8;
                }
                else if( cbp )
                {
                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    h->mb.i_cbp_luma |= 1<<i8x8;
                }
            }

            if( b_decimate )
            {
                if( i_decimate_mb < 6 )
                {
                    h->mb.i_cbp_luma = 0;
                    CLEAR_16x16_NNZ
                }
                else
                {
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                        if( h->mb.i_cbp_luma&(1<<i8x8) )
                            h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                }
            }
        }
    }

    /* encode chroma */
    if( IS_INTRA( h->mb.i_type ) )
    {
        const int i_mode = h->mb.i_chroma_pred_mode;
        if( h->mb.b_lossless )
            x264_predict_lossless_8x8_chroma( h, i_mode );
        else
        {
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
        }
    }

    /* encode the 8x8 blocks */
    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );

    /* store cbp */
    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
    if( h->param.b_cabac )
        cbp |= h->mb.cache.non_zero_count[x264_scan8[24]] << 8
            |  h->mb.cache.non_zero_count[x264_scan8[25]] << 9
            |  h->mb.cache.non_zero_count[x264_scan8[26]] << 10;
    h->mb.cbp[h->mb.i_mb_xy] = cbp;

    /* Check for P_SKIP
     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
     *      (if multiple mv give same result)*/
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

/*****************************************************************************
 * x264_macroblock_probe_skip:
 *  Check if the current MB could be encoded as a [PB]_SKIP
 *****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
    ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
    ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );

    int i_qp = h->mb.i_qp;
    int thresh, ssd;

    if( !b_bidir )
    {
        /* Get the MV */
        mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
        mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

        /* Motion compensation */
        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvp[0], mvp[1], 16, 16, &h->sh.weight[0][0] );
    }

    for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
    {
        int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
        int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
        /* get luma diff */
        h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[0] + fenc_offset,
                                    h->mb.pic.p_fdec[0] + fdec_offset );
        /* encode one 4x4 block */
        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
        {
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score16( dctscan );
            if( i_decimate_mb >= 6 )
                return 0;
        }
    }

    /* encode chroma */
    i_qp = h->mb.i_chroma_qp;
    thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;

    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];

        if( !b_bidir )
        {
            /* Special case for mv0, which is (of course) very common in P-skip mode. */
            if( M32( mvp ) )
            {
                h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch],       FDEC_STRIDE,
                                 h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
                                 mvp[0], mvp[1], 8, 8 );
            }
            else
                h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], 8 );

            if( h->sh.weight[0][1+ch].weightfn )
                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      &h->sh.weight[0][1+ch], 8 );
        }

        /* there is almost never a termination during chroma, but we can't avoid the check entirely */
        /* so instead we check SSD and skip the actual check if the score is low enough. */
        ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
        if( ssd < thresh )
            continue;

        /* The vast majority of chroma checks will terminate during the DC check or the higher
         * threshold check, so we can save time by doing a DC-only DCT. */
        h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );

        if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
            return 0;

        /* If there wasn't a termination in DC, we can check against a much higher threshold. */
        if( ssd < thresh*4 )
            continue;

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );

        /* calculate dct coeffs */
        for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
        {
            dct4x4[i4x4][0] = 0;
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score15( dctscan );
            if( i_decimate_mb >= 7 )
                return 0;
        }
    }

    h->mb.b_skip_mc = 1;
    return 1;
}

/****************************************************************************
 * DCT-domain noise reduction / adaptive deadzone
 * from libavcodec
 ****************************************************************************/

void x264_noise_reduction_update( x264_t *h )
{
    for( int cat = 0; cat < 2; cat++ )
    {
        int size = cat ? 64 : 16;
        const uint16_t *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;

        if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
        {
            for( int i = 0; i < size; i++ )
                h->nr_residual_sum[cat][i] >>= 1;
            h->nr_count[cat] >>= 1;
        }

        for( int i = 0; i < size; i++ )
            h->nr_offset[cat][i] =
                ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                 + h->nr_residual_sum[cat][i]/2)
              / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
    }
}
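
/* Intuition for the update above (the scheme inherited from lavc): roughly,
 *     nr_offset[i] ~ strength * nr_count / (nr_residual_sum[i] * weight[i]/256),
 * i.e. the configured strength divided by the weighted average magnitude of
 * coefficient position i, so positions that are habitually tiny (likely
 * noise) get a larger deadzone.  The halving when nr_count overflows turns
 * the statistics into a rolling window so the offsets stay adaptive. */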

/*****************************************************************************
 * RD only; 4 calls to this do not make up for one macroblock_encode.
 * Doesn't transform chroma DC.
 *****************************************************************************/
void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
    int i_qp = h->mb.i_qp;
    int x = i8&1;
    int y = i8>>1;
    int s8 = X264_SCAN8_0 + 2*x + 16*y;
    pixel *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
    pixel *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
    int b_decimate = h->mb.b_dct_decimate;
    int nnz8x8 = 0;
    int nz;

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        if( h->mb.b_transform_8x8 )
        {
            nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
            STORE_8x8_NNZ( s8, nnz8x8 );
        }
        else
        {
            for( int i4 = i8*4; i4 < i8*4+4; i4++ )
            {
                nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
                                    h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
                                    h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
                h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
                nnz8x8 |= nz;
            }
        }
        for( int ch = 0; ch < 2; ch++ )
        {
            dctcoef dc;
            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
            h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
            nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
            if( nnz8x8 )
            {
                h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );

                if( b_decimate && !h->mb.b_trellis )
                    nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );

                if( nnz8x8 )
                {
                    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
                    h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                    STORE_8x8_NNZ( s8, 1 );
                }
                else
                    STORE_8x8_NNZ( s8, 0 );
            }
            else
                STORE_8x8_NNZ( s8, 0 );
        }
        else
        {
            int i_decimate_8x8 = 0;
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
            h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
            for( int i4 = 0; i4 < 4; i4++ )
            {
                nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
                h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
                if( nz )
                {
                    h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
                    if( b_decimate )
                        i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
                    nnz8x8 = 1;
                }
            }

            if( b_decimate && i_decimate_8x8 < 4 )
                nnz8x8 = 0;

            if( nnz8x8 )
                h->dctf.add8x8_idct( p_fdec, dct4x4 );
            else
                STORE_8x8_NNZ( s8, 0 );
        }

        i_qp = h->mb.i_chroma_qp;

        for( int ch = 0; ch < 2; ch++ )
        {
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;

            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
            dct4x4[0] = 0;

            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );

            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
            if( nz )
            {
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
                h->dctf.add4x4_idct( p_fdec, dct4x4 );
            }
        }
    }
    h->mb.i_cbp_luma &= ~(1 << i8);
    h->mb.i_cbp_luma |= nnz8x8 << i8;
    h->mb.i_cbp_chroma = 0x02;
}

/*****************************************************************************
 * RD only, luma only
 *****************************************************************************/
void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
{
    int i_qp = h->mb.i_qp;
    pixel *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
    pixel *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
    int nz;

    /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
    }
    else
    {
        ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
        h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
        nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 );
            h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
            h->dctf.add4x4_idct( p_fdec, dct4x4 );
        }
    }
}