/*****************************************************************************
 * macroblock.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"

/* These chroma DC functions don't have assembly versions and are only used here. */

#define ZIG(i,y,x) level[i] = dct[x][y];
static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
{
    ZIG(0,0,0)
    ZIG(1,0,1)
    ZIG(2,1,0)
    ZIG(3,1,1)
}
#undef ZIG

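/* The 2x2 chroma DC inverse transform is a Hadamard butterfly fused with
 * dequantization.  qbits = i_qp/6 - 5 may be negative, so it is clamped to
 * zero (folding the shift into dmf instead), which keeps the final
 * ">> -qbits" a right shift by a nonnegative amount. */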
#define IDCT_DEQUANT_START \
    int d0 = dct[0][0] + dct[0][1]; \
    int d1 = dct[1][0] + dct[1][1]; \
    int d2 = dct[0][0] - dct[0][1]; \
    int d3 = dct[1][0] - dct[1][1]; \
    int dmf = dequant_mf[i_qp%6][0][0]; \
    int qbits = i_qp/6 - 5; \
    if( qbits > 0 ) \
    { \
        dmf <<= qbits; \
        qbits = 0; \
    }

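/* idct_dequant_2x2_dc scatters the four reconstructed DC values back into
 * the DC slots of the four chroma 4x4 AC blocks; the _dconly variant is
 * used when the AC blocks were decimated and only the DCs are reconstructed
 * in place. */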
static inline void idct_dequant_2x2_dc( int16_t dct[2][2], int16_t dct4x4[4][4][4], int dequant_mf[6][4][4], int i_qp )
{
    IDCT_DEQUANT_START
    dct4x4[0][0][0] = (d0 + d1) * dmf >> -qbits;
    dct4x4[1][0][0] = (d0 - d1) * dmf >> -qbits;
    dct4x4[2][0][0] = (d2 + d3) * dmf >> -qbits;
    dct4x4[3][0][0] = (d2 - d3) * dmf >> -qbits;
}

static inline void idct_dequant_2x2_dconly( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
{
    IDCT_DEQUANT_START
    dct[0][0] = (d0 + d1) * dmf >> -qbits;
    dct[0][1] = (d0 - d1) * dmf >> -qbits;
    dct[1][0] = (d2 + d3) * dmf >> -qbits;
    dct[1][1] = (d2 - d3) * dmf >> -qbits;
}

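/* Forward 2x2 Hadamard over the DC coefficients of the four chroma 4x4
 * blocks.  The DCs are zeroed afterwards so that the per-block AC
 * quantization that follows never sees them. */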
static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] )
{
    int d0 = dct4x4[0][0][0] + dct4x4[1][0][0];
    int d1 = dct4x4[2][0][0] + dct4x4[3][0][0];
    int d2 = dct4x4[0][0][0] - dct4x4[1][0][0];
    int d3 = dct4x4[2][0][0] - dct4x4[3][0][0];
    d[0][0] = d0 + d1;
    d[1][0] = d2 + d3;
    d[0][1] = d0 - d1;
    d[1][1] = d2 - d3;
    dct4x4[0][0][0] = 0;
    dct4x4[1][0][0] = 0;
    dct4x4[2][0][0] = 0;
    dct4x4[3][0][0] = 0;
}

static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
    if( h->mb.b_trellis )
        return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx );
    else
        return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
}

static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx )
{
    int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
    if( h->mb.b_trellis )
        return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
    else
        return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] );
}

/* All encoding functions must output the correct CBP and NNZ values.
 * The entropy coding functions will check CBP first, then NNZ, before
 * actually reading the DCT coefficients.  NNZ still must be correct even
 * if CBP is zero because of the use of NNZ values for context selection.
 * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 * coding and stored as necessary. */

/* This means that decimation can be done merely by adjusting the CBP and NNZ
 * rather than memsetting the coefficients. */

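/* For example, dropping one luma 8x8 block just clears its bit in i_cbp_luma
 * and zeroes its four NNZ cache entries; the stale coefficients are never
 * read because the entropy coder checks CBP/NNZ first. */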
void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
    int nz;
    uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
    uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
    DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );

    if( h->mb.b_lossless )
    {
        h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
        nz = array_non_zero( h->dct.luma4x4[idx] );
        h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
        h->mb.i_cbp_luma |= nz<<(idx>>2);
        return;
    }

    h->dctf.sub4x4_dct( dct4x4, p_src, p_dst );

    nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx );
    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<(idx>>2);
        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
        h->dctf.add4x4_idct( p_dst, dct4x4 );
    }
}

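/* The four scan8 NNZ entries of an 8x8 block sit in two adjacent pairs in the
 * NNZ cache, so they can be set with two 16-bit stores; nz is 0 or 1, hence
 * the multiply by 0x0101. */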
#define STORE_8x8_NNZ(idx,nz)\
{\
    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
}

void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
{
    int x = 8 * (idx&1);
    int y = 8 * (idx>>1);
    int nz;
    uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
    uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
    DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );

    if( h->mb.b_lossless )
    {
        h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
        nz = array_non_zero( h->dct.luma8x8[idx] );
        STORE_8x8_NNZ(idx,nz);
        h->mb.i_cbp_luma |= nz<<idx;
        return;
    }

    h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );

    nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx );
    if( nz )
    {
        h->mb.i_cbp_luma |= 1<<idx;
        h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
        h->dctf.add8x8_idct8( p_dst, dct8x8 );
        STORE_8x8_NNZ(idx,1);
    }
    else
        STORE_8x8_NNZ(idx,0);
}

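/* In i16x16 mode the DC coefficient of each 4x4 block is pulled out and sent
 * through a separate 4x4 Hadamard transform, so the per-block quantization
 * below covers AC coefficients only (DCT_LUMA_AC). */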
static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
    uint8_t  *p_src = h->mb.pic.p_fenc[0];
    uint8_t  *p_dst = h->mb.pic.p_fdec[0];

    DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
    DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );

    int i, nz;
    int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
    int decimate_score = b_decimate ? 0 : 9;

    if( h->mb.b_lossless )
    {
        for( i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od );
            dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0];
            h->dct.luma4x4[i][0] = 0;
            nz = array_non_zero( h->dct.luma4x4[i] );
            h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
            h->mb.i_cbp_luma |= nz;
        }
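        /* i16x16 luma CBP is all-or-nothing: spread any nonzero AC flag to
         * all four 8x8 bits. */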
        h->mb.i_cbp_luma *= 0xf;
        h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );
        return;
    }

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    for( i = 0; i < 16; i++ )
    {
        /* copy dc coeff */
        dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0];
        dct4x4[i][0][0] = 0;

        /* quant/scan/dequant */
        nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i );
        h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
            if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[i] );
            h->mb.i_cbp_luma = 0xf;
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
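    /* Each scan8 row holds four adjacent NNZ entries, so the 16 luma entries
     * can be cleared with four 32-bit stores. */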
    if( decimate_score < 6 )
    {
        h->mb.i_cbp_luma = 0;
        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
    }

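    /* Luma DC is transformed by a separate 4x4 Hadamard pass.  The >>1 on
     * the multiplier and <<1 on the bias below fold the spec's different DC
     * scaling into the shared quant routine. */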
    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1);
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[24]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX: not the exact inverse */
        if( h->mb.i_cbp_luma )
            for( i = 0; i < 16; i++ )
                dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( h->mb.i_cbp_luma )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}

void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{
    int i, ch, nz, nz_dc;
    int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
    h->mb.i_cbp_chroma = 0;

    for( ch = 0; ch < 2; ch++ )
    {
        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
        int i_decimate_score = 0;
        int nz_ac = 0;

        DECLARE_ALIGNED_16( int16_t dct2x2[2][2]  );
        DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );

        if( h->mb.b_lossless )
        {
            for( i = 0; i < 4; i++ )
            {
                int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
                int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
                h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od );
                h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
                h->dct.luma4x4[16+i+ch*4][0] = 0;
                nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( h->dct.chroma_dc[ch] );
            continue;
        }

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
        dct2x2dc( dct2x2, dct4x4 );
        /* calculate dct coeffs */
        for( i = 0; i < 4; i++ )
        {
            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
            h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
            if( nz )
            {
                nz_ac = 1;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
                if( b_decimate )
                    i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter );
        else
            nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );

        h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc;

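        /* Chroma AC uses a slightly higher decimation threshold than luma
         * (7 vs 6); the DC coefficient is coded separately and can survive
         * decimation on its own. */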
        if( (b_decimate && i_decimate_score < 7) || !nz_ac )
        {
            /* Decimate the block */
            h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0;
            if( !nz_dc ) /* Whole block is empty */
                continue;
            /* DC-only */
            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
            idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;
            if( nz_dc )
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
            }
            h->dctf.add8x8_idct( p_dst, dct4x4 );
        }
    }

    if( h->mb.i_cbp_chroma )
        h->mb.i_cbp_chroma = 2;    /* dc+ac (we can't do only ac) */
    else if( h->mb.cache.non_zero_count[x264_scan8[25]] |
             h->mb.cache.non_zero_count[x264_scan8[26]] )
        h->mb.i_cbp_chroma = 1;    /* dc only */
}

static void x264_macroblock_encode_skip( x264_t *h )
{
    h->mb.i_cbp_luma = 0x00;
    h->mb.i_cbp_chroma = 0x00;
    memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE );
    /* store cbp */
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}

/*****************************************************************************
 * x264_macroblock_encode_pskip:
 *  Encode an already marked skip block
 *****************************************************************************/
static void x264_macroblock_encode_pskip( x264_t *h )
{
    const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
                                h->mb.mv_min[0], h->mb.mv_max[0] );
    const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
                                h->mb.mv_min[1], h->mb.mv_max[1] );

    /* don't do pskip motion compensation if it was already done in macroblock_analyse */
    if( !h->mb.b_skip_mc )
    {
        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvx, mvy, 16, 16 );

        h->mc.mc_chroma( h->mb.pic.p_fdec[1],       FDEC_STRIDE,
                         h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                         mvx, mvy, 8, 8 );

        h->mc.mc_chroma( h->mb.pic.p_fdec[2],       FDEC_STRIDE,
                         h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
                         mvx, mvy, 8, 8 );
    }

    x264_macroblock_encode_skip( h );
}

/*****************************************************************************
 * Intra prediction for predictive lossless mode.
 *****************************************************************************/

/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes
 * that the edge pixels of the reconstructed frame are the same as those of the source frame.  This means
 * they will only work correctly if the neighboring blocks are losslessly coded.  In practice, this means
 * lossless mode cannot be mixed with lossy mode within a frame. */
/* This could be resolved by explicitly copying the edge pixels after doing the mc.copy, but that doesn't
 * need to happen unless we decide to allow mixing lossless and lossy compression. */

void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
{
    int stride = h->fenc->i_stride[1] << h->mb.b_interlaced;
    if( i_mode == I_PRED_CHROMA_V )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 );
    }
    else if( i_mode == I_PRED_CHROMA_H )
    {
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 );
        h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 );
    }
    else
    {
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
    }
}

void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;

    if( i_mode == I_PRED_4x4_V )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
    else if( i_mode == I_PRED_4x4_H )
        h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
    else
        h->predict_4x4[i_mode]( p_dst );
}

void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;

    if( i_mode == I_PRED_8x8_V )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
    else if( i_mode == I_PRED_8x8_H )
        h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
    else
        h->predict_8x8[i_mode]( p_dst, edge );
}

void x264_predict_lossless_16x16( x264_t *h, int i_mode )
{
    int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
    if( i_mode == I_PRED_16x16_V )
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-stride, stride, 16 );
    else if( i_mode == I_PRED_16x16_H )
        h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc_plane[0]-1, stride, 16 );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );
}

/*****************************************************************************
 * x264_macroblock_encode:
 *****************************************************************************/
void x264_macroblock_encode( x264_t *h )
{
    int i_cbp_dc = 0;
    int i_qp = h->mb.i_qp;
    int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
    int b_force_no_skip = 0;
    int i,idx,nz;
    h->mb.i_cbp_luma = 0;
    h->mb.cache.non_zero_count[x264_scan8[24]] = 0;

    if( h->sh.b_mbaff
        && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride
        && IS_SKIP(h->mb.type[h->sh.i_first_mb]) )
    {
        /* The first skip is predicted to be a frame mb pair.
         * We don't yet support the aff part of mbaff, so force it to non-skip
         * so that we can pick the aff flag. */
        b_force_no_skip = 1;
        if( IS_SKIP(h->mb.i_type) )
        {
            if( h->mb.i_type == P_SKIP )
                h->mb.i_type = P_L0;
            else if( h->mb.i_type == B_SKIP )
                h->mb.i_type = B_DIRECT;
        }
    }

    if( h->mb.i_type == P_SKIP )
    {
        /* A bit special */
        x264_macroblock_encode_pskip( h );
        return;
    }
    if( h->mb.i_type == B_SKIP )
    {
        /* don't do bskip motion compensation if it was already done in macroblock_analyse */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );
        x264_macroblock_encode_skip( h );
        return;
    }

    if( h->mb.i_type == I_16x16 )
    {
        const int i_mode = h->mb.i_intra16x16_pred_mode;
        h->mb.b_transform_8x8 = 0;

        if( h->mb.b_lossless )
            x264_predict_lossless_16x16( h, i_mode );
        else
            h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0] );

        /* encode the 16x16 macroblock */
        x264_mb_encode_i16x16( h, i_qp );
    }
    else if( h->mb.i_type == I_8x8 )
    {
        DECLARE_ALIGNED_16( uint8_t edge[33] );
        h->mb.b_transform_8x8 = 1;
        /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
        }
        for( i = h->mb.i_skip_intra ? 3 : 0 ; i < 4; i++ )
        {
            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
            x264_predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );

            if( h->mb.b_lossless )
                x264_predict_lossless_8x8( h, p_dst, i, i_mode, edge );
            else
                h->predict_8x8[i_mode]( p_dst, edge );

            x264_mb_encode_i8x8( h, i, i_qp );
        }
    }
    else if( h->mb.i_type == I_4x4 )
    {
        h->mb.b_transform_8x8 = 0;
        /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
        if( h->mb.i_skip_intra )
        {
            h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
            *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
            h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
            /* In RD mode, restore the now-overwritten DCT data. */
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
        }
        for( i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
        {
            uint8_t  *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
            int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];

            if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
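                /* (splat the rightmost top pixel across all four topright bytes) */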
                *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;

            if( h->mb.b_lossless )
                x264_predict_lossless_4x4( h, p_dst, i, i_mode );
            else
                h->predict_4x4[i_mode]( p_dst );
            x264_mb_encode_i4x4( h, i, i_qp );
        }
    }
    else    /* Inter MB */
    {
        int i8x8, i4x4;
        int i_decimate_mb = 0;

        /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
        if( !h->mb.b_skip_mc )
            x264_mb_mc( h );

        if( h->mb.b_lossless )
        {
            if( h->mb.b_transform_8x8 )
                for( i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    int x = 8*(i8x8&1);
                    int y = 8*(i8x8>>1);
                    h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
                                        h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
                                        h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
                    nz = array_non_zero( h->dct.luma8x8[i8x8] );
                    STORE_8x8_NNZ(i8x8,nz);
                    h->mb.i_cbp_luma |= nz << i8x8;
                }
            else
                for( i4x4 = 0; i4x4 < 16; i4x4++ )
                {
                    h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
                                        h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
                                        h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
                    nz = array_non_zero( h->dct.luma4x4[i4x4] );
                    h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
                    h->mb.i_cbp_luma |= nz << (i4x4>>2);
                }
        }
        else if( h->mb.b_transform_8x8 )
        {
            DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
            b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
            h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[1] += h->mb.b_noise_reduction * 4;

            for( idx = 0; idx < 4; idx++ )
            {
                if( h->mb.b_noise_reduction )
                    h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );

                if( nz )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] );
                    if( b_decimate )
                    {
                        int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] );
                        i_decimate_mb += i_decimate_8x8;
                        if( i_decimate_8x8 >= 4 )
                            h->mb.i_cbp_luma |= 1<<idx;
                    }
                    else
                        h->mb.i_cbp_luma |= 1<<idx;
                }
            }

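            /* Even when individual 8x8 blocks kept their coefficients, drop
             * the entire luma residual if the whole-MB score is still small. */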
            if( i_decimate_mb < 6 && b_decimate )
            {
                h->mb.i_cbp_luma = 0;
                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
                *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
            }
            else
            {
                for( idx = 0; idx < 4; idx++ )
                {
                    if( h->mb.i_cbp_luma&(1<<idx) )
                    {
                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
                        h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] );
                        STORE_8x8_NNZ(idx,1);
                    }
                    else
                        STORE_8x8_NNZ(idx,0);
                }
            }
        }
        else
        {
            DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
            h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
            h->nr_count[0] += h->mb.b_noise_reduction * 16;

            for( i8x8 = 0; i8x8 < 4; i8x8++ )
            {
                int i_decimate_8x8 = 0;
                int cbp = 0;

                /* encode one 4x4 block */
                for( i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    idx = i8x8 * 4 + i4x4;

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                    nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
                    h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;

                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
                        if( b_decimate && i_decimate_8x8 < 6 )
                            i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
                        cbp = 1;
                    }
                }

                /* decimate this 8x8 block */
                i_decimate_mb += i_decimate_8x8;
                if( b_decimate )
                {
                    if( i_decimate_8x8 < 4 )
                        STORE_8x8_NNZ(i8x8,0)
                    else
                        h->mb.i_cbp_luma |= 1<<i8x8;
                }
                else if( cbp )
                {
                    h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                    h->mb.i_cbp_luma |= 1<<i8x8;
                }
            }

            if( b_decimate )
            {
                if( i_decimate_mb < 6 )
                {
                    h->mb.i_cbp_luma = 0;
                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
                    *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
                }
                else
                {
                    for( i8x8 = 0; i8x8 < 4; i8x8++ )
                        if( h->mb.i_cbp_luma&(1<<i8x8) )
                            h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
                }
            }
        }
    }

    /* encode chroma */
    if( IS_INTRA( h->mb.i_type ) )
    {
        const int i_mode = h->mb.i_chroma_pred_mode;
        if( h->mb.b_lossless )
            x264_predict_lossless_8x8_chroma( h, i_mode );
        else
        {
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
            h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
        }
    }

    /* encode the 8x8 blocks */
    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );

    if( h->param.b_cabac )
    {
        i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]]
                 | h->mb.cache.non_zero_count[x264_scan8[25]] << 1
                 | h->mb.cache.non_zero_count[x264_scan8[26]] << 2;
    }

    /* store cbp */
    h->mb.cbp[h->mb.i_mb_xy] = (i_cbp_dc << 8) | (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma;

    /* Check for P_SKIP
     * XXX: in the ME, perhaps we should take x264_mb_predict_mv_pskip into account
     *      (when multiple MVs give the same result) */
    if( !b_force_no_skip )
    {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
            *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
        }

        /* Check for B_SKIP */
        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
        {
            h->mb.i_type = B_SKIP;
        }
    }
}

/*****************************************************************************
 * x264_macroblock_probe_skip:
 *  Check whether the current MB could be encoded as a [PB]_SKIP
 *  (this assumes the QP of the previous MB is used).
 *****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
    DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
    DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
    DECLARE_ALIGNED_16( int16_t dctscan[16] );

    int i_qp = h->mb.i_qp;
    int mvp[2];
    int ch, thresh, ssd;

    int i8x8, i4x4;
    int i_decimate_mb;

    if( !b_bidir )
    {
        /* Get the MV */
        mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
        mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

        /* Motion compensation */
        h->mc.mc_luma( h->mb.pic.p_fdec[0],    FDEC_STRIDE,
                       h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
                       mvp[0], mvp[1], 16, 16 );
    }

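    /* A MB can only be skipped if its entire residual would be decimated
     * anyway, so bail out as soon as the running score reaches the luma
     * threshold. */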
    for( i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
    {
        int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
        int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;
        /* get luma diff */
        h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[0] + fenc_offset,
                                    h->mb.pic.p_fdec[0] + fdec_offset );
        /* encode one 4x4 block */
        for( i4x4 = 0; i4x4 < 4; i4x4++ )
        {
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score16( dctscan );
            if( i_decimate_mb >= 6 )
                return 0;
        }
    }

    /* encode chroma */
    i_qp = h->mb.i_chroma_qp;
    thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;

    for( ch = 0; ch < 2; ch++ )
    {
        uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
        uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];

        if( !b_bidir )
        {
            h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch],       FDEC_STRIDE,
                             h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
                             mvp[0], mvp[1], 8, 8 );
        }

        /* there is almost never a termination during chroma, but we can't avoid the check entirely */
        /* so instead we check SSD and skip the actual check if the score is low enough. */
        ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
        if( ssd < thresh )
            continue;

        h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );

        /* calculate dct DC */
        dct2x2dc( dct2x2, dct4x4 );
        if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
            return 0;

        /* If there wasn't a termination in DC, we can check against a much higher threshold. */
        if( ssd < thresh*4 )
            continue;

        /* calculate dct coeffs */
        for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
        {
            if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
                continue;
            h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
            i_decimate_mb += h->quantf.decimate_score15( dctscan );
            if( i_decimate_mb >= 7 )
                return 0;
        }
    }

    h->mb.b_skip_mc = 1;
    return 1;
}

/****************************************************************************
 * DCT-domain noise reduction / adaptive deadzone
 * from libavcodec
 ****************************************************************************/

void x264_noise_reduction_update( x264_t *h )
{
    int cat, i;
    for( cat = 0; cat < 2; cat++ )
    {
        int size = cat ? 64 : 16;
        const uint16_t *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;

        if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
        {
            for( i = 0; i < size; i++ )
                h->nr_residual_sum[cat][i] >>= 1;
            h->nr_count[cat] >>= 1;
        }

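        /* The offset is roughly strength / mean(|coef|), weighted per
         * coefficient: coefficients that are small on average (likely noise)
         * get a larger subtractive offset, i.e. a wider adaptive deadzone. */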
        for( i = 0; i < size; i++ )
            h->nr_offset[cat][i] =
                ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                 + h->nr_residual_sum[cat][i]/2)
              / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
    }
}

/*****************************************************************************
 * RD only; four calls to this do not add up to one macroblock_encode.
 * Does not transform the chroma DC.
 *****************************************************************************/
void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
    int i_qp = h->mb.i_qp;
    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE;
    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
    int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate;
    int nnz8x8 = 0;
    int ch, nz;

    x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        int i4;
        if( h->mb.b_transform_8x8 )
        {
            h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
            nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
            STORE_8x8_NNZ(i8,nnz8x8);
        }
        else
        {
            for( i4 = i8*4; i4 < i8*4+4; i4++ )
            {
                int nz;
                h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
                                    h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
                                    h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
                nz = array_non_zero( h->dct.luma4x4[i4] );
                h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
                nnz8x8 |= nz;
            }
        }
        for( ch = 0; ch < 2; ch++ )
        {
            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
            h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec );
            h->dct.luma4x4[16+i8+ch*4][0] = 0;
            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
            h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
            nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
            if( nnz8x8 )
            {
                h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 );

                if( b_decimate && !h->mb.b_trellis )
                    nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] );

                if( nnz8x8 )
                {
                    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
                    h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                    STORE_8x8_NNZ(i8,1);
                }
                else
                    STORE_8x8_NNZ(i8,0);
            }
            else
                STORE_8x8_NNZ(i8,0);
        }
        else
        {
            int i4;
            int i_decimate_8x8 = 0;
            DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
            h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
            for( i4 = 0; i4 < 4; i4++ )
            {
                nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 );
                h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz;
                if( nz )
                {
                    h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
                    if( b_decimate )
                        i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
                    nnz8x8 = 1;
                }
            }

            if( b_decimate && i_decimate_8x8 < 4 )
                nnz8x8 = 0;

            if( nnz8x8 )
                h->dctf.add8x8_idct( p_fdec, dct4x4 );
            else
                STORE_8x8_NNZ(i8,0);
        }

        i_qp = h->mb.i_chroma_qp;

        for( ch = 0; ch < 2; ch++ )
        {
            DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
            p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
            p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;

            h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
            dct4x4[0][0] = 0;

            if( h->mb.b_trellis )
                nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 );
            else
                nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );

            h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
            if( nz )
            {
                h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
                h->dctf.add4x4_idct( p_fdec, dct4x4 );
            }
        }
    }
    h->mb.i_cbp_luma &= ~(1 << i8);
    h->mb.i_cbp_luma |= nnz8x8 << i8;
    h->mb.i_cbp_chroma = 0x02;
}

/*****************************************************************************
 * RD only, luma only
 *****************************************************************************/
void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
{
    int i_qp = h->mb.i_qp;
    uint8_t *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
    uint8_t *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
    const int i_ref = h->mb.cache.ref[0][x264_scan8[i4]];
    const int mvx   = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][0], h->mb.mv_min[0], h->mb.mv_max[0] );
    const int mvy   = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] );
    int nz;

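    /* MVs are in quarter-pel units, so the 4-pixel block offset inside the
     * MB becomes 4*4 in MV units. */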
    h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 );

    if( h->mb.b_lossless )
    {
        h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
        h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] );
    }
    else
    {
        DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
        h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
        nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
        h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
        if( nz )
        {
            h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 );
            h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp );
            h->dctf.add4x4_idct( p_fdec, dct4x4 );
        }
    }
}