/*****************************************************************************
 * macroblock.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"

/* These chroma DC functions don't have assembly versions and are only used here. */

#define ZIG(i,y,x) level[i] = dct[x*2+y];
static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
{
    ZIG(0,0,0)
    ZIG(1,0,1)
    ZIG(2,1,0)
    ZIG(3,1,1)
}
#undef ZIG
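/* Expanded, the above is just: level[0] = dct[0]; level[1] = dct[2];
 * level[2] = dct[1]; level[3] = dct[3].  It undoes the transposed order
 * in which dct2x2dc() below stores the four chroma DC coefficients,
 * yielding the raster order in which they are coded. */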

#define IDCT_DEQUANT_START \
    int d0 = dct[0] + dct[1]; \
    int d1 = dct[2] + dct[3]; \
    int d2 = dct[0] - dct[1]; \
    int d3 = dct[2] - dct[3]; \
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
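/* dmf is the dequant scale for the DC position: a table lookup by QP%6,
 * doubled once per six QP steps (each +6 in QP exactly doubles the step
 * size).  For example, i_qp == 26 selects dequant_mf[2][0] << 4. */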

static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_START
    dct4x4[0][0] = (d0 + d1) * dmf >> 5;
    dct4x4[1][0] = (d0 - d1) * dmf >> 5;
    dct4x4[2][0] = (d2 + d3) * dmf >> 5;
    dct4x4[3][0] = (d2 - d3) * dmf >> 5;
}

static inline void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_START
    out[0] = (d0 + d1) * dmf >> 5;
    out[1] = (d0 - d1) * dmf >> 5;
    out[2] = (d2 + d3) * dmf >> 5;
    out[3] = (d2 - d3) * dmf >> 5;
}

static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
{
    int d0 = dct4x4[0][0] + dct4x4[1][0];
    int d1 = dct4x4[2][0] + dct4x4[3][0];
    int d2 = dct4x4[0][0] - dct4x4[1][0];
    int d3 = dct4x4[2][0] - dct4x4[3][0];
    d[0] = d0 + d1;
    d[2] = d2 + d3;
    d[1] = d0 - d1;
    d[3] = d2 - d3;
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
}
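/* 2x2 Hadamard transform of the four 4x4 DC terms: one butterfly over the
 * block pairs, one over the partial sums.  Note that d[1] and d[2] end up
 * swapped relative to raster order; zigzag_scan_2x2_dc() above compensates.
 * The DCs are then cleared from the AC blocks, since they're coded
 * separately as a 2x2 chroma DC block. */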

static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
    if( h->mb.b_trellis )
        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, 0, idx );
    else
        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
}

static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
    if( h->mb.b_trellis )
        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
    else
        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
}

/* All encoding functions must output the correct CBP and NNZ values.
 * The entropy coding functions will check CBP first, then NNZ, before
 * actually reading the DCT coefficients.  NNZ still must be correct even
 * if CBP is zero because of the use of NNZ values for context selection.
 * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 * coding and stored as necessary. */

/* This means that decimation can be done merely by adjusting the CBP and NNZ
 * rather than memsetting the coefficients. */
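/* Illustrative sketch (not literal code from below): decimating a 4x4 block
 * thus amounts to no more than
 *
 *     h->mb.cache.non_zero_count[x264_scan8[idx]] = 0; // mark block empty
 *
 * plus leaving its CBP bit unset; the stale coefficients left in h->dct are
 * harmless because the entropy coder consults CBP and NNZ first. */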

void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
    pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
        h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
        h->mb.i_cbp_luma |= nz<<(idx>>2);
        return;
    }

    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );

    nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<(idx>>2);
        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
        h->dctf.add4x4_idct( p_dst, dct4x4 );
    }
}

#define STORE_8x8_NNZ( s8, nz )\
do\
{\
    M16( &h->mb.cache.non_zero_count[(s8) + 0*8] ) = (nz) * 0x0101;\
    M16( &h->mb.cache.non_zero_count[(s8) + 1*8] ) = (nz) * 0x0101;\
} while(0)
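/* nz is 0 or 1 here; multiplying by 0x0101 broadcasts it into both bytes of
 * a 16-bit store, so each M16 fills the NNZ cache entries of two horizontally
 * adjacent 4x4 blocks at once.  Two such stores (consecutive rows of the
 * scan8 layout, stride 8) cover all four 4x4 blocks of the 8x8. */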

#define CLEAR_16x16_NNZ \
{\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
}

void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
{
    int x = idx&1;
    int y = idx>>1;
    int s8 = X264_SCAN8_0 + 2*x + 16*y;
    int nz;
    pixel *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
    pixel *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
    ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
        STORE_8x8_NNZ( s8, nz );
        h->mb.i_cbp_luma |= nz<<idx;
        return;
    }

    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );

    nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<idx;
        h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
        h->dctf.add8x8_idct8( p_dst, dct8x8 );
        STORE_8x8_NNZ( s8, 1 );
    }
    else
        STORE_8x8_NNZ( s8, 0 );
}

static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
    pixel *p_src = h->mb.pic.p_fenc[0];
    pixel *p_dst = h->mb.pic.p_fdec[0];

    ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
    ALIGNED_ARRAY_16( dctcoef, dct_dc4x4,[16] );

    int nz;
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;

    if( h->mb.b_lossless )
    {
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
            h->mb.i_cbp_luma |= nz;
        }
        h->mb.i_cbp_luma *= 0xf;
        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
        return;
    }

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    for( int i = 0; i < 16; i++ )
    {
        /* copy dc coeff */
        dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
        dct4x4[i][0] = 0;

        /* quant/scan/dequant */
        nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
        h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[i] );
            h->mb.i_cbp_luma = 0xf;
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
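    /* Broadly, decimate_score15 returns a small score for blocks that are
     * cheap to drop (a few low-magnitude trailing coefficients) and a large
     * one otherwise; the `decimate_score < 6` check above stops accumulating
     * once a single block has already disqualified decimation of the MB. */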
    if( decimate_score < 6 )
    {
        h->mb.i_cbp_luma = 0;
        CLEAR_16x16_NNZ
    }

    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_dc_trellis( h, dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1, 0 );
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[24]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
        if( h->mb.i_cbp_luma )
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( h->mb.i_cbp_luma )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}

static inline int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
    dctcoef out[4];
    idct_dequant_2x2_dconly( out, dct, dequant_mf, i_qp );
    return ((ref[0] ^ (out[0]+32))
          | (ref[1] ^ (out[1]+32))
          | (ref[2] ^ (out[2]+32))
          | (ref[3] ^ (out[3]+32))) >> 6;
}
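/* ref[] comes in with the +32 rounding bias already added (see
 * x264_mb_optimize_chroma_dc below).  Since a>>6 == b>>6 iff (a^b)>>6 == 0,
 * this returns nonzero iff any of the four freshly dequantized outputs now
 * rounds to a different value than the original -- a branchless "did the
 * reconstruction change" test. */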

/* Round down coefficients losslessly in DC-only chroma blocks.
 * Unlike luma blocks, this can't be done with a lookup table or
 * other shortcut technique because of the interdependencies
 * between the coefficients due to the chroma DC transform. */
static inline int x264_mb_optimize_chroma_dc( x264_t *h, int b_inter, int i_qp, dctcoef dct2x2[4] )
{
    dctcoef dct2x2_orig[4];
    int coeff, nz;

    /* If the QP is too high, there's no benefit to rounding optimization. */
    if( h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << (i_qp/6) > 32*64 )
        return 1;

    idct_dequant_2x2_dconly( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
    dct2x2_orig[0] += 32;
    dct2x2_orig[1] += 32;
    dct2x2_orig[2] += 32;
    dct2x2_orig[3] += 32;

    /* If the DC coefficients already round to zero, terminate early. */
    if( !((dct2x2_orig[0]|dct2x2_orig[1]|dct2x2_orig[2]|dct2x2_orig[3])>>6) )
        return 0;

    /* Start with the highest frequency coefficient... is this the best option? */
    for( nz = 0, coeff = h->quantf.coeff_last[DCT_CHROMA_DC]( dct2x2 ); coeff >= 0; coeff-- )
    {
        int level = dct2x2[coeff];
        int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */

        while( level )
        {
            dct2x2[coeff] = level - sign;
            if( idct_dequant_round_2x2_dc( dct2x2_orig, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
            {
                nz = 1;
                dct2x2[coeff] = level;
                break;
            }
            level -= sign;
        }
    }

    return nz;
}

void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
    h->mb.i_cbp_chroma = 0;

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
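    /* var2_8x8 returns the variance of the chroma residual and writes its
     * SSD; if the U plane alone already blows the budget, V isn't measured
     * at all.  When the combined score stays under 4*thresh, chroma for the
     * whole MB is zeroed below, except that a DC-only encode is still
     * attempted for any channel whose own SSD exceeds thresh. */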
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
    {
        int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd[2];
        int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
        if( score < thresh*4 )
            score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
        if( score < thresh*4 )
        {
            h->mb.cache.non_zero_count[x264_scan8[16]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[17]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[18]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[19]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[20]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[21]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[22]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[23]] = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[25]] ) = 0;

            for( int ch = 0; ch < 2; ch++ )
            {
                if( ssd[ch] > thresh )
                {
                    h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
                    else
                        nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

                    if( nz_dc )
                    {
                        if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
                        zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                        h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }

    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = 0;
        int nz_ac = 0;

        ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );

        if( h->mb.b_lossless )
        {
            for( int i = 0; i < 4; i++ )
            {
                int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
                int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
            continue;
        }

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
        dct2x2dc( dct2x2, dct4x4 );
        /* calculate dct coeffs */
        for( int i = 0; i < 4; i++ )
        {
            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
            if( nz )
            {
                nz_ac = 1;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                if( b_decimate )
                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1 );
        else
            nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc;

        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
        {
            /* Decimate the block */
            h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0;
                continue;
            }
            /* DC-only */
            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;
            if( nz_dc )
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            }
            h->dctf.add8x8_idct( p_dst, dct4x4 );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
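    /* The two chroma DC NNZ bytes are adjacent in the cache, so M16 reads
     * both at once.  Working the expression through: if any AC survived,
     * i_cbp_chroma == 1 going in, giving (x|1)+1 == 2; otherwise the result
     * is 1 if either DC block is nonzero, else 0. */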
    h->mb.i_cbp_chroma = ((!!M16( &h->mb.cache.non_zero_count[x264_scan8[25]] )) | h->mb.i_cbp_chroma) + h->mb.i_cbp_chroma;
}

static void x264_macroblock_encode_skip( x264_t *h )
{
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] ) = 0;
    for( int i = 16; i < 24; i++ )
        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
    h->mb.i_cbp_luma = 0;
    h->mb.i_cbp_chroma = 0;
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}

/*****************************************************************************
 * x264_macroblock_encode_pskip:
 *  Encode an already marked skip block
 *****************************************************************************/
static void x264_macroblock_encode_pskip( x264_t *h )
{
    /* don't do pskip motion compensation if it was already done in macroblock_analyse */
    if( !h->mb.b_skip_mc )
    {
        int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                              h->mb.mv_min[0], h->mb.mv_max[0] );
        int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                              h->mb.mv_min[1], h->mb.mv_max[1] );

        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvx, mvy, 16, 16, &h->sh.weight[0][0] );

        /* Special case for mv0, which is (of course) very common in P-skip mode. */
        if( mvx | mvy )
            h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                             mvx, mvy, 8, 8 );
        else
            h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );

        if( h->sh.weight[0][1].weightfn )
            h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                               h->mb.pic.p_fdec[1], FDEC_STRIDE,
                                               &h->sh.weight[0][1], 8 );
        if( h->sh.weight[0][2].weightfn )
            h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                               h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                               &h->sh.weight[0][2], 8 );
    }

    x264_macroblock_encode_skip( h );
}

/*****************************************************************************
 * Intra prediction for predictive lossless mode.
 *****************************************************************************/

void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
{
    if( i_mode == I_PRED_CHROMA_V )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 );
        memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
        memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
    }
    else if( i_mode == I_PRED_CHROMA_H )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 );
        x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
        x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
    }
    else
    {
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
    }
}

void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    pixel *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;

    if( i_mode == I_PRED_4x4_V )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
    else if( i_mode == I_PRED_4x4_H )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
    else
        h->predict_4x4[i_mode]( p_dst );
}

void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pixel edge[33] )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    pixel *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;

    if( i_mode == I_PRED_8x8_V )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
    else if( i_mode == I_PRED_8x8_H )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
    else
        h->predict_8x8[i_mode]( p_dst, edge );
}

void x264_predict_lossless_16x16( x264_t *h, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    if( i_mode == I_PRED_16x16_V )
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
    else if( i_mode == I_PRED_16x16_H )
        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-1, stride, 16 );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
}

/*****************************************************************************
 * x264_macroblock_encode:
 *****************************************************************************/
void x264_macroblock_encode( x264_t *h )
{
    int i_qp = h->mb.i_qp;
    int b_decimate = h->mb.b_dct_decimate;
    int b_force_no_skip = 0;
    int nz;
    h->mb.i_cbp_luma = 0;
    h->mb.cache.non_zero_count[x264_scan8[24]] = 0;

    if( h->mb.i_type == I_PCM )
    {
        /* if PCM is chosen, we need to store reconstructed frame data */
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
        h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
        return;
    }

    if( h->sh.b_mbaff
        && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
        && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
    {
        /* The first skip is predicted to be a frame mb pair.
         * We don't yet support the aff part of mbaff, so force it to non-skip
         * so that we can pick the aff flag. */
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    if( h->mb.i_type == P_SKIP )
    {
        /* A bit special */
        x264_macroblock_encode_pskip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        const int i_mode = h->mb.i_intra16x16_pred_mode;
        h->mb.b_transform_8x8 = 0;

        if( h->mb.b_lossless )
            x264_predict_lossless_16x16( h, i_mode );
        else
            h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );

        /* encode the 16x16 macroblock */
        x264_mb_encode_i16x16( h, i_qp );
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( pixel, edge,[33] );
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( int i = h->mb.i_skip_intra ? 3 : 0 ; i < 4; i++ )
        {
            pixel *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
            h->predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );

            if( h->mb.b_lossless )
                x264_predict_lossless_8x8( h, p_dst, i, i_mode, edge );
            else
                h->predict_8x8[i_mode]( p_dst, edge );

            x264_mb_encode_i8x8( h, i, i_qp );
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
            M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
            M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( int i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
        {
            pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
            int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

            if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );

            if( h->mb.b_lossless )
                x264_predict_lossless_4x4( h, p_dst, i, i_mode );
            else
                h->predict_4x4[i_mode]( p_dst );
            x264_mb_encode_i4x4( h, i, i_qp );
        }
    }
    else    /* Inter MB */
    {
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless )
        {
            if( h->mb.b_transform_8x8 )
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    int x = i8x8&1;
                    int y = i8x8>>1;
                    int s8 = X264_SCAN8_0 + 2*x + 16*y;

                    nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE,
                                                                   h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE );
                    STORE_8x8_NNZ( s8, nz );
                    h->mb.i_cbp_luma |= nz << i8x8;
                }
            else
                for( int i4x4 = 0; i4x4 < 16; i4x4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
                                        h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
                                        h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
                    h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
                    h->mb.i_cbp_luma |= nz << (i4x4>>2);
                }
        }
        else if( h->mb.b_transform_8x8 )
        {
            ALIGNED_ARRAY_16( dctcoef, dct8x8,[4],[64] );
            b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
            h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[1] += h->mb.b_noise_reduction * 4;

            for( int idx = 0; idx < 4; idx++ )
            {
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );

                if( nz )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
                    if( b_decimate )
                    {
                        int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
                        i_decimate_mb += i_decimate_8x8;
                        if( i_decimate_8x8 >= 4 )
                            h->mb.i_cbp_luma |= 1<<idx;
                    }
                    else
                        h->mb.i_cbp_luma |= 1<<idx;
                }
            }

            if( i_decimate_mb < 6 && b_decimate )
            {
                h->mb.i_cbp_luma = 0;
                CLEAR_16x16_NNZ
            }
            else
            {
                for( int idx = 0; idx < 4; idx++ )
                {
                    int x = idx&1;
                    int y = idx>>1;
                    int s8 = X264_SCAN8_0 + 2*x + 16*y;

                    if( h->mb.i_cbp_luma&(1<<idx) )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
                        STORE_8x8_NNZ( s8, 1 );
                    }
                    else
                        STORE_8x8_NNZ( s8, 0 );
                }
            }
        }
        else
        {
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16],[16] );
            h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[0] += h->mb.b_noise_reduction * 16;

            for( int i8x8 = 0; i8x8 < 4; i8x8++ )
            {
                int i_decimate_8x8 = 0;
                int cbp = 0;

                /* encode one 4x4 block */
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    int idx = i8x8 * 4 + i4x4;

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                    nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
                    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;

                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
                        if( b_decimate && i_decimate_8x8 < 6 )
                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
                        cbp = 1;
                    }
                }

                int x = i8x8&1;
                int y = i8x8>>1;

                /* decimate this 8x8 block */
                i_decimate_mb += i_decimate_8x8;
                if( b_decimate )
                {
                    if( i_decimate_8x8 < 4 )
                    {
                        int s8 = X264_SCAN8_0 + 2*x + 16*y;
                        STORE_8x8_NNZ( s8, 0 );
                    }
                    else
                        h->mb.i_cbp_luma |= 1<<i8x8;
                }
                else if( cbp )
                {
                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    h->mb.i_cbp_luma |= 1<<i8x8;
                }
            }

            if( b_decimate )
            {
                if( i_decimate_mb < 6 )
                {
                    h->mb.i_cbp_luma = 0;
                    CLEAR_16x16_NNZ
                }
                else
                {
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                        if( h->mb.i_cbp_luma&(1<<i8x8) )
                            h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                }
            }
        }
    }

    /* encode chroma */
    if( IS_INTRA( h->mb.i_type ) )
    {
        const int i_mode = h->mb.i_chroma_pred_mode;
        if( h->mb.b_lossless )
            x264_predict_lossless_8x8_chroma( h, i_mode );
        else
        {
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
        }
    }

    /* encode the 8x8 blocks */
    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );

    /* store cbp */
    int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
    if( h->param.b_cabac )
        cbp |= h->mb.cache.non_zero_count[x264_scan8[24]] << 8
            |  h->mb.cache.non_zero_count[x264_scan8[25]] << 9
            |  h->mb.cache.non_zero_count[x264_scan8[26]] << 10;
    h->mb.cbp[h->mb.i_mb_xy] = cbp;

    /* Check for P_SKIP
     * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
     *      (if multiple mv give same result)*/
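    /* A 16x16 P_L0 block with ref 0, the predicted skip MV and an all-zero
     * CBP codes identically to P_SKIP, so relabel it and save the mode, ref
     * and mv bits. */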
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

/*****************************************************************************
 * x264_macroblock_probe_skip:
 *  Check if the current MB could be encoded as a [PB]_SKIP
 *****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
    ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
    ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
    ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );

    int i_qp = h->mb.i_qp;
    int thresh, ssd;

    if( !b_bidir )
    {
        /* Get the MV */
        mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
        mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

        /* Motion compensation */
        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvp[0], mvp[1], 16, 16, &h->sh.weight[0][0] );
    }

    for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
    {
        int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
        int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
        /* get luma diff */
        h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[0] + fenc_offset,
                                    h->mb.pic.p_fdec[0] + fdec_offset );
        /* encode one 4x4 block */
        for( int i4x4 = 0; i4x4 < 4; i4x4++ )
        {
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score16( dctscan );
            if( i_decimate_mb >= 6 )
                return 0;
        }
    }

    /* encode chroma */
    i_qp = h->mb.i_chroma_qp;
    thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;

    if( !b_bidir )
    {
        /* Special case for mv0, which is (of course) very common in P-skip mode. */
        if( M32( mvp ) )
            h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                             mvp[0], mvp[1], 8, 8 );
        else
            h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
    }

    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];

        if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
            h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                  h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                  &h->sh.weight[0][1+ch], 8 );

        /* there is almost never a termination during chroma, but we can't avoid the check entirely */
        /* so instead we check SSD and skip the actual check if the score is low enough. */
        ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
        if( ssd < thresh )
            continue;

        /* The vast majority of chroma checks will terminate during the DC check or the higher
         * threshold check, so we can save time by doing a DC-only DCT. */
        h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );

        if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
            return 0;

        /* If there wasn't a termination in DC, we can check against a much higher threshold. */
        if( ssd < thresh*4 )
            continue;

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );

        /* calculate dct coeffs */
        for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
        {
            dct4x4[i4x4][0] = 0;
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score15( dctscan );
            if( i_decimate_mb >= 7 )
                return 0;
        }
    }

    h->mb.b_skip_mc = 1;
    return 1;
}

/****************************************************************************
 * DCT-domain noise reduction / adaptive deadzone
 * from libavcodec
 ****************************************************************************/

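/* Rough intuition for the update below: nr_residual_sum[i]/nr_count is the
 * average magnitude seen at coefficient position i, and the offset works out
 * to approximately
 *
 *     offset[i] ~ i_noise_reduction / (avg_residual[i] * weight[i]/256)
 *
 * so positions that are usually near zero get a large deadzone (their small
 * coefficients are treated as noise and quantized away), while consistently
 * large positions are left mostly alone.  The periodic halving of the sums
 * keeps the statistics slowly decaying rather than growing without bound. */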
void x264_noise_reduction_update( x264_t *h )
{
    for( int cat = 0; cat < 2; cat++ )
    {
        int size = cat ? 64 : 16;
        const uint16_t *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;

        if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
        {
            for( int i = 0; i < size; i++ )
                h->nr_residual_sum[cat][i] >>= 1;
            h->nr_count[cat] >>= 1;
        }

        for( int i = 0; i < size; i++ )
            h->nr_offset[cat][i] =
                ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                 + h->nr_residual_sum[cat][i]/2)
              / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
    }
}

/*****************************************************************************
 * RD only; 4 calls to this do not make up for one macroblock_encode.
 * doesn't transform chroma dc.
 *****************************************************************************/
void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
    int i_qp = h->mb.i_qp;
    int x = i8&1;
    int y = i8>>1;
    int s8 = X264_SCAN8_0 + 2*x + 16*y;
    pixel *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
    pixel *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
    int b_decimate = h->mb.b_dct_decimate;
    int nnz8x8 = 0;
    int nz;

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        if( h->mb.b_transform_8x8 )
        {
            nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
            STORE_8x8_NNZ( s8, nnz8x8 );
        }
        else
        {
            for( int i4 = i8*4; i4 < i8*4+4; i4++ )
            {
                nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
                                    h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
                                    h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
                h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
                nnz8x8 |= nz;
            }
        }
        for( int ch = 0; ch < 2; ch++ )
        {
            dctcoef dc;
            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
            h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
            nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
            if( nnz8x8 )
            {
                h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );

                if( b_decimate && !h->mb.b_trellis )
                    nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );

                if( nnz8x8 )
                {
                    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
                    h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                    STORE_8x8_NNZ( s8, 1 );
                }
                else
                    STORE_8x8_NNZ( s8, 0 );
            }
            else
                STORE_8x8_NNZ( s8, 0 );
        }
        else
        {
            int i_decimate_8x8 = 0;
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
            h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
            for( int i4 = 0; i4 < 4; i4++ )
            {
                nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
                h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
                if( nz )
                {
                    h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
                    if( b_decimate )
                        i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
                    nnz8x8 = 1;
                }
            }

            if( b_decimate && i_decimate_8x8 < 4 )
                nnz8x8 = 0;

            if( nnz8x8 )
                h->dctf.add8x8_idct( p_fdec, dct4x4 );
            else
                STORE_8x8_NNZ( s8, 0 );
        }

        i_qp = h->mb.i_chroma_qp;

        for( int ch = 0; ch < 2; ch++ )
        {
            ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
            p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
            p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;

            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
            dct4x4[0] = 0;

            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );

            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
            if( nz )
            {
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
                h->dctf.add4x4_idct( p_fdec, dct4x4 );
            }
        }
    }
    h->mb.i_cbp_luma &= ~(1 << i8);
    h->mb.i_cbp_luma |= nnz8x8 << i8;
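    /* Chroma DC isn't recomputed here (see the header comment), so
     * conservatively assume a full chroma CBP; presumably overestimating the
     * chroma cost slightly is safer for RD than undercounting it. */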
    h->mb.i_cbp_chroma = 0x02;
}

/*****************************************************************************
 * RD only, luma only
 *****************************************************************************/
void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
{
    int i_qp = h->mb.i_qp;
    pixel *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
    pixel *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
    int nz;

    /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */

    if( h->mb.b_lossless )
    {
        nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
    }
    else
    {
        ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
        h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
        nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 );
            h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
            h->dctf.add4x4_idct( p_fdec, dct4x4 );
        }
    }
}