1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
32 #include "common/common.h"
33 #include "common/cpu.h"
34 #include "macroblock.h"
36 #include "ratecontrol.h"
49 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
50 ALIGNED_4( int16_t mvc[32][5][2] );
54 int i_cost4x4[4]; /* cost per 8x8 partition */
55 x264_me_t me4x4[4][4];
58 int i_cost8x4[4]; /* cost per 8x8 partition */
59 x264_me_t me8x4[4][2];
62 int i_cost4x8[4]; /* cost per 8x8 partition */
63 x264_me_t me4x8[4][2];
73 } x264_mb_analysis_list_t;
77 /* conduct the analysis using this lambda and QP */
82 uint16_t *p_cost_ref0;
83 uint16_t *p_cost_ref1;
88 /* Take some shortcuts in intra search if intra is deemed unlikely */
94 int i_satd_i16x16_dir[7];
99 int i_satd_i8x8_dir[12][4];
103 int i_predict4x4[16];
108 int i_satd_i8x8chroma;
109 int i_satd_i8x8chroma_dir[4];
110 int i_predict8x8chroma;
112 /* II: Inter part P/B frame */
113 x264_mb_analysis_list_t l0;
114 x264_mb_analysis_list_t l1;
116 int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
117 int i_cost16x16direct;
119 int i_cost8x8direct[4];
128 int i_mb_partition16x8[2]; /* mb_partition_e */
129 int i_mb_partition8x16[2];
130 int i_mb_type16x8; /* mb_class_e */
133 int b_direct_available;
135 } x264_mb_analysis_t;
137 /* lambda = pow(2,qp/6-2) */
138 const int x264_lambda_tab[52] = {
139 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
140 1, 1, 1, 1, /* 8-11 */
141 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
142 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
143 6, 7, 8, 9,10,11,13,14, /* 28-35 */
144 16,18,20,23,25,29,32,36, /* 36-43 */
145 40,45,51,57,64,72,81,91 /* 44-51 */
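/* Illustrative check of the formula above (not part of the encoder): at QP 36,
 * lambda = pow(2, 36/6 - 2) = pow(2,4) = 16, which matches x264_lambda_tab[36] == 16;
 * at low QPs the rounded values bottom out at 1. */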
148 /* lambda2 = pow(lambda,2) * .9 * 256 */
149 const int x264_lambda2_tab[52] = {
150 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
151 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
152 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
153 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
154 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
155 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
156 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
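/* Illustrative check (not part of the encoder): at QP 36, lambda == 16, so
 * lambda2 = 16*16 * .9 * 256 = 58982.4, matching x264_lambda2_tab[36] == 58982.
 * The extra *256 keeps lambda2 in Q8 fixed point for the RD cost arithmetic. */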
159 const uint8_t x264_exp2_lut[64] = {
160 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47,
161 50, 53, 57, 60, 64, 67, 71, 74, 78, 81, 85, 89, 93, 96, 100, 104,
162 108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
163 177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
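/* The values above appear to follow round( (pow(2, (i+0.5)/64) - 1) * 256 ), i.e. a
 * Q8 mantissa table for a fixed-point exp2; e.g. i=32 gives (pow(2,32.5/64)-1)*256
 * ~= 108, matching the entry above. (Derivation inferred from the table values,
 * not stated in the original source.) */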
166 const float x264_log2_lut[128] = {
167 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
168 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
169 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
170 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
171 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
172 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
173 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
174 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
175 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
176 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
177 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
178 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
179 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
180 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
181 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
182 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
185 /* Avoid an int/float conversion. */
186 const float x264_log2_lz_lut[32] = {
187 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
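/* Sketch of how the two log2 tables are typically combined (illustrative only; the
 * real helper lives in common.h and __builtin_clz stands in for x264's clz helper):
 *
 *   static inline float log2_approx( uint32_t x )
 *   {
 *       int lz   = __builtin_clz( x );        // leading-zero count -> integer part
 *       int mant = (x << lz >> 24) & 0x7f;    // top 7 mantissa bits below the leading 1
 *       return x264_log2_lz_lut[lz] + x264_log2_lut[mant];
 *   }
 *
 * x264_log2_lz_lut[lz] == 31-lz is stored as float purely to avoid the int/float
 * conversion mentioned above. */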
190 // should the intra and inter lambdas be different?
191 // I'm just matching the behaviour of deadzone quant.
192 static const int x264_trellis_lambda2_tab[2][52] = {
193 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
194 { 46, 58, 73, 92, 117, 147,
195 185, 233, 294, 370, 466, 587,
196 740, 932, 1174, 1480, 1864, 2349,
197 2959, 3728, 4697, 5918, 7457, 9395,
198 11837, 14914, 18790, 23674, 29828, 37581,
199 47349, 59656, 75163, 94699, 119313, 150326,
200 189399, 238627, 300652, 378798, 477255, 601304,
201 757596, 954511, 1202608, 1515192, 1909022, 2405217,
202 3030384, 3818045, 4810435, 6060769 },
203 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
204 { 27, 34, 43, 54, 68, 86,
205 108, 136, 172, 216, 273, 343,
206 433, 545, 687, 865, 1090, 1374,
207 1731, 2180, 2747, 3461, 4361, 5494,
208 6922, 8721, 10988, 13844, 17442, 21976,
209 27688, 34885, 43953, 55377, 69771, 87906,
210 110755, 139543, 175813, 221511, 279087, 351627,
211 443023, 558174, 703255, 886046, 1116348, 1406511,
212 1772093, 2232697, 2813022, 3544186 }
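// Illustrative check (not part of the encoder): the table values imply LAMBDA_BITS == 4;
// e.g. inter, qp=12: .85*.85 * 2**(12/3. + 10 - 4) = 0.7225 * 1024 ~= 740, matching the
// entry above.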
215 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
216 16, 20, 25, 32, 40, 50,
217 64, 80, 101, 128, 161, 203,
218 256, 322, 406, 512, 645, 812,
219 1024, 1290, 1625, 2048, 2580, 3250,
220 4096, 5160, 6501, 8192, 10321, 13003,
221 16384, 20642, 26007, 32768, 41285, 52015,
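/* The entries above follow 2^(i/3 + 4) in Q8, so 256 means "no adjustment"; the table
 * is indexed with i_qp - i_chroma_qp + 12 below, so a zero luma/chroma QP offset lands
 * on entry 12 == 256 and leaves the chroma lambda untouched. (Formula inferred from the
 * values; illustrative note only.) */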
225 /* TODO: calculate CABAC costs */
226 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
227 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
229 static const int i_mb_b16x8_cost_table[17] = {
230 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
232 static const int i_sub_mb_b_cost_table[13] = {
233 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
235 static const int i_sub_mb_p_cost_table[4] = {
239 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
241 /* Indexed by lambda instead of qp because, due to rounding,
242 * some quantizers share lambdas. This saves memory. */
243 uint16_t *x264_cost_mv_fpel[92][4];
244 uint16_t x264_cost_ref[92][3][33];
246 /* initialize an array of lambda*nbits for all possible mvs */
247 static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
249 static int16_t *p_cost_mv[92];
252 if( !p_cost_mv[a->i_lambda] )
255 /* could be faster, but isn't called many times */
256 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
257 CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
258 p_cost_mv[a->i_lambda] += 2*4*2048;
259 for( i = 0; i <= 2*4*2048; i++ )
261 p_cost_mv[a->i_lambda][-i] =
262 p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
264 for( i = 0; i < 3; i++ )
265 for( j = 0; j < 33; j++ )
266 x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
268 a->p_cost_mv = p_cost_mv[a->i_lambda];
269 a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
270 a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
272 /* FIXME is this useful for all me methods? */
273 if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
277 CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
278 x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
279 for( i = -2*2048; i < 2*2048; i++ )
280 x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
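/* Illustrative note on how these buffers are used (a sketch, not new encoder code):
 * p_cost_mv is centered on zero, so the lambda-weighted bit cost of a candidate mv is
 * looked up per component relative to the predictor, roughly
 *
 *   cost = a->p_cost_mv[ mv[0] - mvp[0] ] + a->p_cost_mv[ mv[1] - mvp[1] ];
 *
 * e.g. with lambda 16 (QP 36) a 1-pel (4 qpel) offset from the predictor costs about
 * 16*(2*log2(5) + 0.718 + 1) ~= 102 per component. x264_cost_mv_fpel holds the same
 * costs sampled at full-pel spacing for one qpel phase, for the exhaustive (ESA/TESA)
 * searches. */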
288 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
290 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
292 /* mbrd == 1 -> RD mode decision */
293 /* mbrd == 2 -> RD refinement */
294 /* mbrd == 3 -> QPRD */
295 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
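/* With the subtraction above, subme 6-7 gives i_mbrd 1 (RD mode decision), 8-9 gives 2
 * (RD refinement) and subme 10 gives 3 (QPRD); in B slices the thresholds for levels 1
 * and 2 are effectively one subme step higher because of the SLICE_TYPE_B term. */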
297 /* conduct the analysis using this lambda and QP */
298 a->i_qp = h->mb.i_qp = i_qp;
299 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
301 a->i_lambda = x264_lambda_tab[i_qp];
302 a->i_lambda2 = x264_lambda2_tab[i_qp];
304 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
305 if( h->param.analyse.i_trellis )
307 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
308 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
309 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
310 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
312 h->mb.i_psy_rd_lambda = a->i_lambda;
313 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
314 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
316 h->mb.i_me_method = h->param.analyse.i_me_method;
317 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
318 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
319 && h->mb.i_subpel_refine >= 5;
321 h->mb.b_transform_8x8 = 0;
322 h->mb.b_noise_reduction = 0;
328 a->i_satd_i8x8chroma = COST_MAX;
330 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
331 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
335 h->mb.b_lossless ? 0 :
337 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
339 /* II: Inter part P/B frame */
340 if( h->sh.i_type != SLICE_TYPE_I )
343 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
344 // limit motion search to a slightly smaller range than the theoretical limit,
345 // since the search may go a few iterations past its given range
346 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
348 /* Calculate max allowed MV range */
349 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
350 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
351 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
352 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
353 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
354 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
355 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
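/* Illustrative example (assumed frame size, not from the source): for a 1280-pixel-wide
 * frame (i_mb_width == 80) at i_mb_x == 0, mv_min[0] = 4*(-24) = -96 and
 * mv_max[0] = 4*(16*79 + 24) = 5152 in qpel units before CLIP_FMV() clamps both to
 * +/-i_fmv_range; the fpel bounds are then shrunk by i_fpel_border so a full-pel search
 * that overshoots by a few iterations still stays inside the allowed subpel range. */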
356 if( h->mb.i_mb_x == 0)
358 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
359 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
360 int thread_mvy_range = i_fmv_range;
362 if( h->param.i_threads > 1 )
364 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
365 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
366 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
368 x264_frame_t **fref = i ? h->fref1 : h->fref0;
369 int i_ref = i ? h->i_ref1 : h->i_ref0;
370 for( j=0; j<i_ref; j++ )
372 x264_frame_cond_wait( fref[j], thresh );
373 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
376 if( h->param.b_deterministic )
377 thread_mvy_range = h->param.analyse.i_mv_range_thread;
378 if( h->mb.b_interlaced )
379 thread_mvy_range >>= 1;
382 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
383 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
384 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
385 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
386 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
387 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
388 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
394 a->l0.i_cost8x8 = COST_MAX;
396 for( i = 0; i < 4; i++ )
400 a->l0.i_cost4x8[i] = COST_MAX;
404 a->l0.i_cost8x16 = COST_MAX;
405 if( h->sh.i_type == SLICE_TYPE_B )
409 a->l1.i_cost8x8 = COST_MAX;
411 for( i = 0; i < 4; i++ )
416 a->i_cost8x8direct[i] = COST_MAX;
427 a->i_cost16x16direct =
430 a->i_cost8x16bi = COST_MAX;
433 /* Fast intra decision */
434 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
436 if( IS_INTRA( h->mb.i_mb_type_left )
437 || IS_INTRA( h->mb.i_mb_type_top )
438 || IS_INTRA( h->mb.i_mb_type_topleft )
439 || IS_INTRA( h->mb.i_mb_type_topright )
440 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
441 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
442 { /* intra is likely */ }
458 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
460 if( i_neighbour & MB_TOPLEFT )
462 /* top and left available */
463 *mode++ = I_PRED_16x16_V;
464 *mode++ = I_PRED_16x16_H;
465 *mode++ = I_PRED_16x16_DC;
466 *mode++ = I_PRED_16x16_P;
469 else if( i_neighbour & MB_LEFT )
472 *mode++ = I_PRED_16x16_DC_LEFT;
473 *mode++ = I_PRED_16x16_H;
476 else if( i_neighbour & MB_TOP )
479 *mode++ = I_PRED_16x16_DC_TOP;
480 *mode++ = I_PRED_16x16_V;
486 *mode = I_PRED_16x16_DC_128;
492 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
494 if( i_neighbour & MB_TOPLEFT )
496 /* top and left available */
497 *mode++ = I_PRED_CHROMA_V;
498 *mode++ = I_PRED_CHROMA_H;
499 *mode++ = I_PRED_CHROMA_DC;
500 *mode++ = I_PRED_CHROMA_P;
503 else if( i_neighbour & MB_LEFT )
506 *mode++ = I_PRED_CHROMA_DC_LEFT;
507 *mode++ = I_PRED_CHROMA_H;
510 else if( i_neighbour & MB_TOP )
513 *mode++ = I_PRED_CHROMA_DC_TOP;
514 *mode++ = I_PRED_CHROMA_V;
520 *mode = I_PRED_CHROMA_DC_128;
526 static void predict_4x4_mode_available( unsigned int i_neighbour,
527 int *mode, int *pi_count )
529 int b_l = i_neighbour & MB_LEFT;
530 int b_t = i_neighbour & MB_TOP;
535 *mode++ = I_PRED_4x4_DC;
536 *mode++ = I_PRED_4x4_H;
537 *mode++ = I_PRED_4x4_V;
538 *mode++ = I_PRED_4x4_DDL;
539 if( i_neighbour & MB_TOPLEFT )
541 *mode++ = I_PRED_4x4_DDR;
542 *mode++ = I_PRED_4x4_VR;
543 *mode++ = I_PRED_4x4_HD;
546 *mode++ = I_PRED_4x4_VL;
547 *mode++ = I_PRED_4x4_HU;
551 *mode++ = I_PRED_4x4_DC_LEFT;
552 *mode++ = I_PRED_4x4_H;
553 *mode++ = I_PRED_4x4_HU;
558 *mode++ = I_PRED_4x4_DC_TOP;
559 *mode++ = I_PRED_4x4_V;
560 *mode++ = I_PRED_4x4_DDL;
561 *mode++ = I_PRED_4x4_VL;
566 *mode++ = I_PRED_4x4_DC_128;
571 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
572 static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
574 ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
575 ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
576 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
579 if( do_both_dct || h->mb.b_transform_8x8 )
581 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
582 for( i = 0; i < 4; i++ )
583 h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
585 if( do_both_dct || !h->mb.b_transform_8x8 )
587 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
588 for( i = 0; i < 16; i++ )
589 h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
593 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
594 static inline void x264_mb_cache_fenc_satd( x264_t *h )
596 ALIGNED_16( static uint8_t zero[16] ) = {0};
598 int x, y, satd_sum = 0, sa8d_sum = 0;
599 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
600 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
601 if( !h->mb.i_psy_rd )
603 for( y = 0; y < 4; y++ )
604 for( x = 0; x < 4; x++ )
606 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
607 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
608 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
609 satd_sum += h->mb.pic.fenc_satd[y][x];
611 for( y = 0; y < 2; y++ )
612 for( x = 0; x < 2; x++ )
614 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
615 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
616 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
617 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
619 h->mb.pic.fenc_satd_sum = satd_sum;
620 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
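/* Why the sad>>1 / sad>>2 terms above strip the DC coefficient (illustrative note):
 * against an all-zero block the DC Hadamard coefficient is just the pixel sum, which
 * equals the SAD; since x264's satd is normalised by 1/2 and sa8d by roughly 1/4, the
 * DC contribution is sad>>1 for 4x4 and sad>>2 for 8x8, so subtracting it leaves an
 * AC-only complexity measure for psy-RD. */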
623 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
629 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
631 uint8_t *p_dstc[2], *p_srcc[2];
633 if( a->i_satd_i8x8chroma < COST_MAX )
636 /* 8x8 prediction selection for chroma */
637 p_dstc[0] = h->mb.pic.p_fdec[1];
638 p_dstc[1] = h->mb.pic.p_fdec[2];
639 p_srcc[0] = h->mb.pic.p_fenc[1];
640 p_srcc[1] = h->mb.pic.p_fenc[2];
642 predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
643 a->i_satd_i8x8chroma = COST_MAX;
644 if( i_max == 4 && b_merged_satd )
646 int satdu[4], satdv[4];
647 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
648 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
649 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
650 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
651 satdu[I_PRED_CHROMA_P] =
652 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
653 satdv[I_PRED_CHROMA_P] =
654 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
656 for( i=0; i<i_max; i++ )
658 int i_mode = predict_mode[i];
659 int i_satd = satdu[i_mode] + satdv[i_mode]
660 + a->i_lambda * bs_size_ue(i_mode);
662 a->i_satd_i8x8chroma_dir[i] = i_satd;
663 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
668 for( i=0; i<i_max; i++ )
671 int i_mode = predict_mode[i];
673 /* we do the prediction */
674 if( h->mb.b_lossless )
675 x264_predict_lossless_8x8_chroma( h, i_mode );
678 h->predict_8x8c[i_mode]( p_dstc[0] );
679 h->predict_8x8c[i_mode]( p_dstc[1] );
682 /* we calculate the cost */
683 i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
684 p_srcc[0], FENC_STRIDE ) +
685 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
686 p_srcc[1], FENC_STRIDE ) +
687 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
689 a->i_satd_i8x8chroma_dir[i] = i_satd;
690 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
694 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
697 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
699 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
700 uint8_t *p_src = h->mb.pic.p_fenc[0];
701 uint8_t *p_dst = h->mb.pic.p_fdec[0];
706 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
708 /*---------------- Try all modes and calculate their scores --------------*/
710 /* 16x16 prediction selection */
711 predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
713 if( b_merged_satd && i_max == 4 )
715 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
716 h->predict_16x16[I_PRED_16x16_P]( p_dst );
717 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
718 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
721 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
722 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
727 for( i = 0; i < i_max; i++ )
730 int i_mode = predict_mode[i];
732 if( h->mb.b_lossless )
733 x264_predict_lossless_16x16( h, i_mode );
735 h->predict_16x16[i_mode]( p_dst );
737 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
738 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
739 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
740 a->i_satd_i16x16_dir[i_mode] = i_satd;
744 if( h->sh.i_type == SLICE_TYPE_B )
745 /* cavlc mb type prefix */
746 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
747 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
750 /* 8x8 prediction selection */
751 if( flags & X264_ANALYSE_I8x8 )
753 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
754 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
755 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
757 h->mb.i_cbp_luma = 0;
758 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
760 // FIXME some bias like in i4x4?
761 if( h->sh.i_type == SLICE_TYPE_B )
762 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
764 for( idx = 0;; idx++ )
768 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
769 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
770 int i_best = COST_MAX;
771 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
773 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
774 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
776 if( b_merged_satd && i_max == 9 )
779 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
780 satd[i_pred_mode] -= 3 * a->i_lambda;
781 for( i=2; i>=0; i-- )
783 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
784 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
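/* The +/- lambda terms mirror the intra mode signalling cost: the predicted (most
 * probable) mode costs ~1 bit, any other mode ~4 bits (1 flag + 3 bits), so the
 * merged-SATD path subtracts 3*lambda for the predicted mode and adds 4*lambda to
 * every candidate, matching the (pred ? 1 : 4)*lambda weighting of the per-mode
 * loop below. */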
791 for( ; i<i_max; i++ )
794 int i_mode = predict_mode[i];
796 if( h->mb.b_lossless )
797 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
799 h->predict_8x8[i_mode]( p_dst_by, edge );
801 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
802 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
804 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
805 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
809 if( idx == 3 || i_cost > i_satd_thresh )
812 /* we need to encode this block now (for next ones) */
813 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
814 x264_mb_encode_i8x8( h, idx, a->i_qp );
816 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
821 a->i_satd_i8x8 = i_cost;
822 if( h->mb.i_skip_intra )
824 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
825 h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
826 h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
827 h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
828 h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
829 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
830 if( h->mb.i_skip_intra == 2 )
831 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
836 static const uint16_t cost_div_fix8[3] = {1024,512,341};
837 a->i_satd_i8x8 = COST_MAX;
838 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
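/* cost_div_fix8 is 4/(idx+1) in Q8 (1024, 512, 341 ~= 4/1, 4/2, 4/3 scaled by 256):
 * when the 8x8 loop exits early after only idx+1 of the 4 blocks, the partial cost is
 * extrapolated to a whole-macroblock estimate for the comparison below, while
 * i_satd_i8x8 itself stays at COST_MAX because no complete result exists. */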
840 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
844 /* 4x4 prediction selection */
845 if( flags & X264_ANALYSE_I4x4 )
848 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
849 h->mb.i_cbp_luma = 0;
850 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
852 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
854 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
855 if( h->sh.i_type == SLICE_TYPE_B )
856 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
858 for( idx = 0;; idx++ )
860 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
861 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
862 int i_best = COST_MAX;
863 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
865 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
867 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
868 /* emulate missing topright samples */
869 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
871 if( b_merged_satd && i_max >= 6 )
874 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
875 satd[i_pred_mode] -= 3 * a->i_lambda;
876 for( i=2; i>=0; i-- )
877 COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
878 a->i_predict4x4[idx], i );
884 for( ; i<i_max; i++ )
887 int i_mode = predict_mode[i];
888 if( h->mb.b_lossless )
889 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
891 h->predict_4x4[i_mode]( p_dst_by );
893 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
894 p_src_by, FENC_STRIDE )
895 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
897 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
901 if( i_cost > i_satd_thresh || idx == 15 )
904 /* we need to encode this block now (for next ones) */
905 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
906 x264_mb_encode_i4x4( h, idx, a->i_qp );
908 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
912 a->i_satd_i4x4 = i_cost;
913 if( h->mb.i_skip_intra )
915 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
916 h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
917 h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
918 h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
919 h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
920 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
921 if( h->mb.i_skip_intra == 2 )
922 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
926 a->i_satd_i4x4 = COST_MAX;
930 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
932 if( a->i_satd_i16x16 <= i_satd_thresh )
934 h->mb.i_type = I_16x16;
935 x264_analyse_update_cache( h, a );
936 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
939 a->i_satd_i16x16 = COST_MAX;
941 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
943 h->mb.i_type = I_4x4;
944 x264_analyse_update_cache( h, a );
945 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
948 a->i_satd_i4x4 = COST_MAX;
950 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
952 h->mb.i_type = I_8x8;
953 x264_analyse_update_cache( h, a );
954 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
955 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
958 a->i_satd_i8x8 = COST_MAX;
961 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
963 uint8_t *p_dst = h->mb.pic.p_fdec[0];
966 int i_max, i_mode, i_thresh;
967 uint64_t i_satd, i_best;
969 h->mb.i_skip_intra = 0;
971 if( h->mb.i_type == I_16x16 )
973 int old_pred_mode = a->i_predict16x16;
974 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
975 i_best = a->i_satd_i16x16;
976 predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
977 for( i = 0; i < i_max; i++ )
979 int i_mode = predict_mode[i];
980 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
982 h->mb.i_intra16x16_pred_mode = i_mode;
983 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
984 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
988 /* RD selection for chroma prediction */
989 predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
992 i_thresh = a->i_satd_i8x8chroma * 5/4;
994 for( i = j = 0; i < i_max; i++ )
995 if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
996 predict_mode[i] != a->i_predict8x8chroma )
998 predict_mode[j++] = predict_mode[i];
1004 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1005 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1006 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1007 * coefs for the current chroma mode are still around, so we only
1008 * have to recount the bits. */
1009 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1010 for( i = 0; i < i_max; i++ )
1012 i_mode = predict_mode[i];
1013 if( h->mb.b_lossless )
1014 x264_predict_lossless_8x8_chroma( h, i_mode );
1017 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1018 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1020 /* if we've already found a mode that needs no residual, then
1021 * probably any mode with a residual will be worse.
1022 * so avoid dct on the remaining modes to improve speed. */
1023 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1024 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1026 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1027 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1031 if( h->mb.i_type == I_4x4 )
1033 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1035 for( idx = 0; idx < 16; idx++ )
1037 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1038 i_best = COST_MAX64;
1040 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1042 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1043 /* emulate missing topright samples */
1044 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1046 for( i = 0; i < i_max; i++ )
1048 i_mode = predict_mode[i];
1049 if( h->mb.b_lossless )
1050 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1052 h->predict_4x4[i_mode]( p_dst_by );
1053 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1055 if( i_best > i_satd )
1057 a->i_predict4x4[idx] = i_mode;
1059 pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
1060 pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
1061 pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
1062 pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
1063 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1067 *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
1068 *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
1069 *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
1070 *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
1071 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1073 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1076 else if( h->mb.i_type == I_8x8 )
1078 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1079 for( idx = 0; idx < 4; idx++ )
1081 uint64_t pels_h = 0;
1086 int cbp_luma_new = 0;
1087 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1089 i_best = COST_MAX64;
1093 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1094 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1095 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1097 for( i = 0; i < i_max; i++ )
1099 i_mode = predict_mode[i];
1100 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1102 if( h->mb.b_lossless )
1103 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1105 h->predict_8x8[i_mode]( p_dst_by, edge );
1106 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1107 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1109 if( i_best > i_satd )
1111 a->i_predict8x8[idx] = i_mode;
1112 cbp_luma_new = h->mb.i_cbp_luma;
1115 pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
1117 for( j=0; j<7; j++ )
1118 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1119 i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
1120 i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
1123 a->i_cbp_i8x8_luma = cbp_luma_new;
1124 *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
1126 for( j=0; j<7; j++ )
1127 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1128 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
1129 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
1131 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1136 #define LOAD_FENC( m, src, xoff, yoff) \
1137 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1138 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1139 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1140 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1141 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1143 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1144 (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1145 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1146 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1147 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1148 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1149 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1150 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
1152 #define REF_COST(list, ref) \
1153 (a->p_cost_ref##list[ref])
1155 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1159 ALIGNED_4( int16_t mvc[8][2] );
1160 int i_halfpel_thresh = INT_MAX;
1161 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1163 /* 16x16 search over all ref frames */
1164 m.i_pixel = PIXEL_16x16;
1165 m.p_cost_mv = a->p_cost_mv;
1166 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1168 a->l0.me16x16.cost = INT_MAX;
1169 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1171 const int i_ref_cost = REF_COST( 0, i_ref );
1172 i_halfpel_thresh -= i_ref_cost;
1173 m.i_ref_cost = i_ref_cost;
1176 /* search with ref */
1177 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1178 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1179 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1180 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1182 /* early termination
1183 * SSD threshold would probably be better than SATD */
1186 && m.cost-m.cost_mv < 300*a->i_lambda
1187 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1188 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1189 && x264_macroblock_probe_pskip( h ) )
1191 h->mb.i_type = P_SKIP;
1192 x264_analyse_update_cache( h, a );
1193 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1197 m.cost += i_ref_cost;
1198 i_halfpel_thresh += i_ref_cost;
1200 if( m.cost < a->l0.me16x16.cost )
1201 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1203 /* save mv for predicting neighbors */
1204 *(uint32_t*)a->l0.mvc[i_ref][0] =
1205 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1208 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1209 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1211 h->mb.i_type = P_L0;
1214 x264_mb_cache_fenc_satd( h );
1215 if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
1217 h->mb.i_partition = D_16x16;
1218 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1219 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1224 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1228 uint8_t **p_fenc = h->mb.pic.p_fenc;
1229 int i_halfpel_thresh = INT_MAX;
1230 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1232 int i_maxref = h->mb.pic.i_fref[0]-1;
1234 h->mb.i_partition = D_8x8;
1236 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1237 * than those used by the neighbors */
1238 if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
1239 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1242 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
1243 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
1244 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
1245 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
1246 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
1247 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
1250 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1251 *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
1253 for( i = 0; i < 4; i++ )
1255 x264_me_t *l0m = &a->l0.me8x8[i];
1259 m.i_pixel = PIXEL_8x8;
1260 m.p_cost_mv = a->p_cost_mv;
1262 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1263 l0m->cost = INT_MAX;
1264 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1266 const int i_ref_cost = REF_COST( 0, i_ref );
1267 i_halfpel_thresh -= i_ref_cost;
1268 m.i_ref_cost = i_ref_cost;
1271 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1272 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1273 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1274 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1276 m.cost += i_ref_cost;
1277 i_halfpel_thresh += i_ref_cost;
1278 *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
1280 if( m.cost < l0m->cost )
1281 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1283 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1284 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1287 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1290 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1291 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1292 /* P_8x8 ref0 has no ref cost */
1293 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1294 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1295 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1296 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1297 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1300 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1302 const int i_ref = a->l0.me16x16.i_ref;
1303 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1304 uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
1305 uint8_t **p_fenc = h->mb.pic.p_fenc;
1307 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1310 /* XXX Needed for x264_mb_predict_mv */
1311 h->mb.i_partition = D_8x8;
1314 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
1316 for( i = 0; i < 4; i++ )
1318 x264_me_t *m = &a->l0.me8x8[i];
1322 m->i_pixel = PIXEL_8x8;
1323 m->p_cost_mv = a->p_cost_mv;
1324 m->i_ref_cost = i_ref_cost;
1327 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1328 LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
1329 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1330 x264_me_search( h, m, mvc, i_mvc );
1332 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1334 *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
1338 m->cost += i_ref_cost;
1339 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1342 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1343 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1344 /* theoretically this should include 4*ref_cost,
1345 * but 3 seems a better approximation of cabac. */
1346 if( h->param.b_cabac )
1347 a->l0.i_cost8x8 -= i_ref_cost;
1348 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1349 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1352 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1355 uint8_t **p_fenc = h->mb.pic.p_fenc;
1356 ALIGNED_4( int16_t mvc[3][2] );
1359 /* XXX Needed for x264_mb_predict_mv */
1360 h->mb.i_partition = D_16x8;
1362 for( i = 0; i < 2; i++ )
1364 x264_me_t *l0m = &a->l0.me16x8[i];
1365 const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1366 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1368 m.i_pixel = PIXEL_16x8;
1369 m.p_cost_mv = a->p_cost_mv;
1371 LOAD_FENC( &m, p_fenc, 0, 8*i );
1372 l0m->cost = INT_MAX;
1373 for( j = 0; j < i_ref8s; j++ )
1375 const int i_ref = ref8[j];
1376 const int i_ref_cost = REF_COST( 0, i_ref );
1377 m.i_ref_cost = i_ref_cost;
1380 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1381 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1382 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
1383 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
1385 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1386 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1387 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1388 x264_me_search( h, &m, mvc, 3 );
1390 m.cost += i_ref_cost;
1392 if( m.cost < l0m->cost )
1393 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1395 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1396 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1399 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1402 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1405 uint8_t **p_fenc = h->mb.pic.p_fenc;
1406 ALIGNED_4( int16_t mvc[3][2] );
1409 /* XXX Needed for x264_mb_predict_mv */
1410 h->mb.i_partition = D_8x16;
1412 for( i = 0; i < 2; i++ )
1414 x264_me_t *l0m = &a->l0.me8x16[i];
1415 const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1416 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1418 m.i_pixel = PIXEL_8x16;
1419 m.p_cost_mv = a->p_cost_mv;
1421 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1422 l0m->cost = INT_MAX;
1423 for( j = 0; j < i_ref8s; j++ )
1425 const int i_ref = ref8[j];
1426 const int i_ref_cost = REF_COST( 0, i_ref );
1427 m.i_ref_cost = i_ref_cost;
1430 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1431 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
1432 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
1434 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1435 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1436 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1437 x264_me_search( h, &m, mvc, 3 );
1439 m.cost += i_ref_cost;
1441 if( m.cost < l0m->cost )
1442 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1444 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1445 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1448 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1451 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1453 ALIGNED_8( uint8_t pix1[16*8] );
1454 uint8_t *pix2 = pix1+8;
1455 const int i_stride = h->mb.pic.i_stride[1];
1456 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1457 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
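/* Layout note: each 8x8 luma partition corresponds to one 4x4 chroma block per plane,
 * so the U and V predictions are motion-compensated side by side into a 16-wide scratch
 * buffer (pix2 = pix1 + 8) and each plane is then scored with a single 4x4 mbcmp call;
 * 'or' and 'oe' are the matching offsets into the reference and fenc chroma planes. */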
1459 #define CHROMA4x4MC( width, height, me, x, y ) \
1460 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
1461 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
1463 if( pixel == PIXEL_4x4 )
1465 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
1466 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
1467 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
1468 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
1470 else if( pixel == PIXEL_8x4 )
1472 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
1473 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
1477 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
1478 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
1481 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1482 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1485 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1487 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1488 uint8_t **p_fenc = h->mb.pic.p_fenc;
1489 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1492 /* XXX Needed for x264_mb_predict_mv */
1493 h->mb.i_partition = D_8x8;
1495 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1497 const int idx = 4*i8x8 + i4x4;
1498 const int x4 = block_idx_x[idx];
1499 const int y4 = block_idx_y[idx];
1500 const int i_mvc = (i4x4 == 0);
1502 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1504 m->i_pixel = PIXEL_4x4;
1505 m->p_cost_mv = a->p_cost_mv;
1507 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1508 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1510 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1511 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1513 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1515 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1516 a->l0.me4x4[i8x8][1].cost +
1517 a->l0.me4x4[i8x8][2].cost +
1518 a->l0.me4x4[i8x8][3].cost +
1519 REF_COST( 0, i_ref ) +
1520 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1521 if( h->mb.b_chroma_me )
1522 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1525 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1527 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1528 uint8_t **p_fenc = h->mb.pic.p_fenc;
1529 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1532 /* XXX Needed for x264_mb_predict_mv */
1533 h->mb.i_partition = D_8x8;
1535 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1537 const int idx = 4*i8x8 + 2*i8x4;
1538 const int x4 = block_idx_x[idx];
1539 const int y4 = block_idx_y[idx];
1540 const int i_mvc = (i8x4 == 0);
1542 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1544 m->i_pixel = PIXEL_8x4;
1545 m->p_cost_mv = a->p_cost_mv;
1547 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1548 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1550 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1551 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1553 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1555 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1556 REF_COST( 0, i_ref ) +
1557 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1558 if( h->mb.b_chroma_me )
1559 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1562 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1564 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1565 uint8_t **p_fenc = h->mb.pic.p_fenc;
1566 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1569 /* XXX Needed for x264_mb_predict_mv */
1570 h->mb.i_partition = D_8x8;
1572 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1574 const int idx = 4*i8x8 + i4x8;
1575 const int x4 = block_idx_x[idx];
1576 const int y4 = block_idx_y[idx];
1577 const int i_mvc = (i4x8 == 0);
1579 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1581 m->i_pixel = PIXEL_4x8;
1582 m->p_cost_mv = a->p_cost_mv;
1584 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1585 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1587 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1588 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1590 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1592 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1593 REF_COST( 0, i_ref ) +
1594 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1595 if( h->mb.b_chroma_me )
1596 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1599 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1601 /* Assumes that fdec still contains the results of
1602 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1604 uint8_t **p_fenc = h->mb.pic.p_fenc;
1605 uint8_t **p_fdec = h->mb.pic.p_fdec;
1608 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1609 for( i = 0; i < 4; i++ )
1611 const int x = (i&1)*8;
1612 const int y = (i>>1)*8;
1613 a->i_cost16x16direct +=
1614 a->i_cost8x8direct[i] =
1615 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1618 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1622 #define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
1624 h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
1627 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1629 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1630 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1631 uint8_t *src0, *src1;
1632 int stride0 = 16, stride1 = 16;
1636 ALIGNED_4( int16_t mvc[9][2] );
1637 int i_halfpel_thresh = INT_MAX;
1638 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1640 /* 16x16 search over all ref frames */
1641 m.i_pixel = PIXEL_16x16;
1642 m.p_cost_mv = a->p_cost_mv;
1643 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1646 a->l0.me16x16.cost = INT_MAX;
1647 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1649 /* search with ref */
1650 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1651 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1652 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1653 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1656 m.cost += REF_COST( 0, i_ref );
1658 if( m.cost < a->l0.me16x16.cost )
1660 a->l0.i_ref = i_ref;
1661 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1664 /* save mv for predicting neighbors */
1665 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1667 /* subtract ref cost, so we don't have to add it for the other MB types */
1668 a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
1671 i_halfpel_thresh = INT_MAX;
1672 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1673 a->l1.me16x16.cost = INT_MAX;
1674 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1676 /* search with ref */
1677 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1678 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1679 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1680 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1683 m.cost += REF_COST( 1, i_ref );
1685 if( m.cost < a->l1.me16x16.cost )
1687 a->l1.i_ref = i_ref;
1688 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1691 /* save mv for predicting neighbors */
1692 *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1694 /* subtract ref cost, so we don't have to add it for the other MB types */
1695 a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1697 /* Set global ref, needed for other modes? */
1698 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1699 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1701 /* get cost of BI mode */
1702 src0 = h->mc.get_ref( pix0, &stride0,
1703 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1704 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
1705 src1 = h->mc.get_ref( pix1, &stride1,
1706 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1707 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
1709 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1711 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1712 + REF_COST( 0, a->l0.i_ref )
1713 + REF_COST( 1, a->l1.i_ref )
1714 + a->l0.me16x16.cost_mv
1715 + a->l1.me16x16.cost_mv;
1718 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1719 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1720 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1723 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1725 const int x = 2*(i%2);
1726 const int y = 2*(i/2);
1728 switch( h->mb.i_sub_partition[i] )
1731 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1734 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1735 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1738 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1739 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1742 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1743 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1744 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1745 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1748 x264_log( h, X264_LOG_ERROR, "internal error\n" );
1753 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1754 if( x264_mb_partition_listX_table[0][part] ) \
1756 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1757 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1761 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1762 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1764 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1766 if( x264_mb_partition_listX_table[1][part] ) \
1768 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1769 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1773 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1774 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1776 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1779 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1783 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1785 x264_mb_load_mv_direct8x8( h, i );
1788 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1789 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1790 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1795 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1798 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1800 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1802 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1804 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1808 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1810 uint8_t **p_fref[2] =
1811 { h->mb.pic.p_fref[0][a->l0.i_ref],
1812 h->mb.pic.p_fref[1][a->l1.i_ref] };
1813 ALIGNED_8( uint8_t pix[2][8*8] );
1816 /* XXX Needed for x264_mb_predict_mv */
1817 h->mb.i_partition = D_8x8;
1821 for( i = 0; i < 4; i++ )
1826 int i_part_cost_bi = 0;
1827 int stride[2] = {8,8};
1830 for( l = 0; l < 2; l++ )
1832 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1833 x264_me_t *m = &lX->me8x8[i];
1835 m->i_pixel = PIXEL_8x8;
1836 m->p_cost_mv = a->p_cost_mv;
1838 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1839 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1841 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1842 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1844 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1847 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1848 m->mv[0], m->mv[1], 8, 8 );
1849 i_part_cost_bi += m->cost_mv;
1850 /* FIXME: ref cost */
1852 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1853 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1854 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1855 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1856 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1858 i_part_cost = a->l0.me8x8[i].cost;
1859 h->mb.i_sub_partition[i] = D_L0_8x8;
1860 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1861 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1862 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1863 a->i_cost8x8bi += i_part_cost;
1865 /* XXX Needed for x264_mb_predict_mv */
1866 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1870 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1873 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1875 uint8_t **p_fref[2] =
1876 { h->mb.pic.p_fref[0][a->l0.i_ref],
1877 h->mb.pic.p_fref[1][a->l1.i_ref] };
1878 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1879 ALIGNED_4( int16_t mvc[2][2] );
1882 h->mb.i_partition = D_16x8;
1883 a->i_cost16x8bi = 0;
1885 for( i = 0; i < 2; i++ )
1888 int i_part_cost_bi = 0;
1889 int stride[2] = {16,16};
1892 /* TODO: check only the list(s) that were used in b8x8? */
1893 for( l = 0; l < 2; l++ )
1895 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1896 x264_me_t *m = &lX->me16x8[i];
1898 m->i_pixel = PIXEL_16x8;
1899 m->p_cost_mv = a->p_cost_mv;
1901 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1902 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1904 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
1905 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
1907 x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
1908 x264_me_search( h, m, mvc, 2 );
1911 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1912 m->mv[0], m->mv[1], 16, 8 );
1913 /* FIXME: ref cost */
1914 i_part_cost_bi += m->cost_mv;
1916 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1917 i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1919 i_part_cost = a->l0.me16x8[i].cost;
1920 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1921 if( a->l1.me16x8[i].cost < i_part_cost )
1923 i_part_cost = a->l1.me16x8[i].cost;
1924 a->i_mb_partition16x8[i] = D_L1_8x8;
1926 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1928 i_part_cost = i_part_cost_bi;
1929 a->i_mb_partition16x8[i] = D_BI_8x8;
1931 a->i_cost16x8bi += i_part_cost;
1933 x264_mb_cache_mv_b16x8( h, a, i, 0 );
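/* Map the two per-partition predictions to a B 16x8 macroblock type: D_L0_8x8, D_L1_8x8 and D_BI_8x8
 * shift down (>>2) to 0, 1 and 2, indexing the 3x3 grid of B_xx_xx types starting at B_L0_L0. */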
1937 a->i_mb_type16x8 = B_L0_L0
1938 + (a->i_mb_partition16x8[0]>>2) * 3
1939 + (a->i_mb_partition16x8[1]>>2);
1940 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
1943 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1945 uint8_t **p_fref[2] =
1946 { h->mb.pic.p_fref[0][a->l0.i_ref],
1947 h->mb.pic.p_fref[1][a->l1.i_ref] };
1948 ALIGNED_8( uint8_t pix[2][8*16] );
1949 ALIGNED_4( int16_t mvc[2][2] );
1952 h->mb.i_partition = D_8x16;
1953 a->i_cost8x16bi = 0;
1955 for( i = 0; i < 2; i++ )
1958 int i_part_cost_bi = 0;
1959 int stride[2] = {8,8};
1962 for( l = 0; l < 2; l++ )
1964 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1965 x264_me_t *m = &lX->me8x16[i];
1967 m->i_pixel = PIXEL_8x16;
1968 m->p_cost_mv = a->p_cost_mv;
1970 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1971 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
1973 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
1974 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
1976 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1977 x264_me_search( h, m, mvc, 2 );
1980 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1981 m->mv[0], m->mv[1], 8, 16 );
1982 /* FIXME: ref cost */
1983 i_part_cost_bi += m->cost_mv;
1986 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1987 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1989 i_part_cost = a->l0.me8x16[i].cost;
1990 a->i_mb_partition8x16[i] = D_L0_8x8;
1991 if( a->l1.me8x16[i].cost < i_part_cost )
1993 i_part_cost = a->l1.me8x16[i].cost;
1994 a->i_mb_partition8x16[i] = D_L1_8x8;
1996 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1998 i_part_cost = i_part_cost_bi;
1999 a->i_mb_partition8x16[i] = D_BI_8x8;
2001 a->i_cost8x16bi += i_part_cost;
2003 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2007 a->i_mb_type8x16 = B_L0_L0
2008 + (a->i_mb_partition8x16[0]>>2) * 3
2009 + (a->i_mb_partition8x16[1]>>2);
2010 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2013 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2015 int thresh = i_satd * 5/4;
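/* P_L0 16x16 is re-scored with RD if its SATD is within 50% of the best inter SATD;
 * the other partitions only if they are within 25% (thresh). */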
2017 h->mb.i_type = P_L0;
2018 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2020 h->mb.i_partition = D_16x16;
2021 x264_analyse_update_cache( h, a );
2022 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2025 if( a->l0.i_cost16x8 <= thresh )
2027 h->mb.i_partition = D_16x8;
2028 x264_analyse_update_cache( h, a );
2029 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2032 a->l0.i_cost16x8 = COST_MAX;
2034 if( a->l0.i_cost8x16 <= thresh )
2036 h->mb.i_partition = D_8x16;
2037 x264_analyse_update_cache( h, a );
2038 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2041 a->l0.i_cost8x16 = COST_MAX;
2043 if( a->l0.i_cost8x8 <= thresh )
2045 h->mb.i_type = P_8x8;
2046 h->mb.i_partition = D_8x8;
2047 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2050 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2051 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2052 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2053 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2054 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2055 * for future blocks are those left over from previous RDO calls. */
2056 for( i = 0; i < 4; i++ )
2058 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2059 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2060 int subtype, btype = D_L0_8x8;
2061 uint64_t bcost = COST_MAX64;
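/* Only sub-partition types within 25% of the cheapest SATD are re-scored with RD below; if none
 * qualifies before reaching D_L0_8x8, its RD call is skipped too, since btype already defaults
 * to D_L0_8x8 and there is nothing to compare it against. */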
2062 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2065 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2067 h->mb.i_sub_partition[i] = subtype;
2068 x264_mb_cache_mv_p8x8( h, a, i );
2069 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2070 COPY2_IF_LT( bcost, cost, btype, subtype );
2072 h->mb.i_sub_partition[i] = btype;
2073 x264_mb_cache_mv_p8x8( h, a, i );
2077 x264_analyse_update_cache( h, a );
2078 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2081 a->l0.i_cost8x8 = COST_MAX;
2084 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2086 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
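/* Modes whose SATD cost is within ~6% of the best inter SATD (~12% with psy-RD) get a full RD
 * evaluation below; direct is always evaluated when available. */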
2088 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2090 h->mb.i_type = B_DIRECT;
2091 /* Assumes direct/skip MC is still in fdec */
2092 /* Requires b-rdo to be done before intra analysis */
2093 h->mb.b_skip_mc = 1;
2094 x264_analyse_update_cache( h, a );
2095 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2096 h->mb.b_skip_mc = 0;
2099 //FIXME not all the update_cache calls are needed
2100 h->mb.i_partition = D_16x16;
2102 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2104 h->mb.i_type = B_L0_L0;
2105 x264_analyse_update_cache( h, a );
2106 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2110 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2112 h->mb.i_type = B_L1_L1;
2113 x264_analyse_update_cache( h, a );
2114 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2118 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2120 h->mb.i_type = B_BI_BI;
2121 x264_analyse_update_cache( h, a );
2122 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2126 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2128 h->mb.i_type = B_8x8;
2129 h->mb.i_partition = D_8x8;
2130 x264_analyse_update_cache( h, a );
2131 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2132 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2136 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2138 h->mb.i_type = a->i_mb_type16x8;
2139 h->mb.i_partition = D_16x8;
2140 x264_analyse_update_cache( h, a );
2141 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2145 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2147 h->mb.i_type = a->i_mb_type8x16;
2148 h->mb.i_partition = D_8x16;
2149 x264_analyse_update_cache( h, a );
2150 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2154 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2156 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2159 if( IS_INTRA(h->mb.i_type) )
2162 switch( h->mb.i_partition )
2165 if( h->mb.i_type == B_BI_BI )
2166 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
2169 for( i=0; i<2; i++ )
2170 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2171 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2174 for( i=0; i<2; i++ )
2175 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2176 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2179 for( i=0; i<4; i++ )
2180 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2181 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2186 static inline void x264_mb_analyse_transform( x264_t *h )
2188 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2190 int i_cost4, i_cost8;
2191 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
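/* SATD (4x4 Hadamard) approximates the residual cost under the 4x4 transform, while SA8D
 * approximates it under the 8x8 transform; whichever is smaller decides b_transform_8x8. */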
2194 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2195 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2196 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2197 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2199 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2200 h->mb.b_skip_mc = 1;
2204 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2206 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2209 x264_analyse_update_cache( h, a );
2210 h->mb.b_transform_8x8 ^= 1;
2211 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2212 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
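/* If the toggled transform size is at least as good, keep it and rescale the SATD score by the RD
 * ratio so later comparisons (e.g. against intra) stay on a consistent scale; otherwise revert. */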
2214 if( *i_rd >= i_rd8 )
2217 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2221 h->mb.b_transform_8x8 ^= 1;
2225 /* Rate-distortion optimal QP selection.
2226 * FIXME: More than half of the benefit of this function seems to be
2227 * in the way it improves the coding of chroma DC (by decimating or
2228 * finding a better way to code a single DC coefficient.)
2229 * There must be a more efficient way to get that portion of the benefit
2230 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2231 * trick. */
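/* The search below steps the QP away from its starting value (upward only if the MB has nonzero
 * coefficients), re-running full macroblock RD at each step until the allowed number of failures
 * is hit, and always also tries the previous macroblock's QP. */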
2232 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2234 int bcost, cost, direction, failures, prevcost, origcost;
2235 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2236 int last_qp_tried = 0;
2237 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2239 /* If CBP is already zero, don't raise the quantizer any higher. */
2240 for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2242 /* Without psy-RD, require monotonicity when moving quant away from previous
2243 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2244 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2245 * allow 2 failures when moving quant towards previous quant.
2246 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2247 int threshold = (!!h->mb.i_psy_rd);
2248 /* Raise the threshold for failures if we're moving towards the last QP. */
2249 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2250 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2252 h->mb.i_qp = orig_qp;
2254 prevcost = origcost;
2255 h->mb.i_qp += direction;
2256 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2258 if( h->mb.i_last_qp == h->mb.i_qp )
2260 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2261 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2262 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2264 /* We can't assume that the costs are monotonic over QPs.
2265 * Tie case-as-failure seems to give better results. */
2266 if( cost < prevcost )
2272 if( failures > threshold )
2274 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2276 h->mb.i_qp += direction;
2280 /* Always try the last block's QP. */
2281 if( !last_qp_tried )
2283 h->mb.i_qp = h->mb.i_last_qp;
2284 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2285 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2286 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2290 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2292 /* Check transform again; decision from before may no longer be optimal. */
2293 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2294 x264_mb_transform_8x8_allowed( h ) )
2296 h->mb.b_transform_8x8 ^= 1;
2297 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2299 h->mb.b_transform_8x8 ^= 1;
2303 /*****************************************************************************
2304 * x264_macroblock_analyse:
2305 *****************************************************************************/
2306 int x264_macroblock_analyse( x264_t *h )
2308 x264_mb_analysis_t analysis;
2309 int i_cost = COST_MAX;
2312 h->mb.i_qp = x264_ratecontrol_qp( h );
2313 if( h->param.rc.i_aq_mode )
2315 x264_adaptive_quant( h );
2316 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2317 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
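/* In CAVLC, mb_qp_delta is coded as signed Exp-Golomb: a delta of 0 costs 1 bit while +/-1 costs 3,
 * so reusing the previous QP saves roughly 2 bits per macroblock. */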
2318 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2319 h->mb.i_qp = h->mb.i_last_qp;
2322 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2324 /*--------------------------- Do the analysis ---------------------------*/
2325 if( h->sh.i_type == SLICE_TYPE_I )
2327 if( analysis.i_mbrd )
2328 x264_mb_cache_fenc_satd( h );
2329 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2330 if( analysis.i_mbrd )
2331 x264_intra_rd( h, &analysis, COST_MAX );
2333 i_cost = analysis.i_satd_i16x16;
2334 h->mb.i_type = I_16x16;
2335 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2336 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2337 if( analysis.i_satd_pcm < i_cost )
2338 h->mb.i_type = I_PCM;
2340 else if( analysis.i_mbrd >= 2 )
2341 x264_intra_rd_refine( h, &analysis );
2343 else if( h->sh.i_type == SLICE_TYPE_P )
2347 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2349 /* Fast P_SKIP detection */
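/* With fast_pskip, an immediate P_SKIP probe only runs when a neighbouring macroblock was itself
 * skipped; at subme >= 3 the check is instead deferred (b_try_pskip) into the 16x16 search, and the
 * threaded case skips it entirely when the predicted skip MV could point past the reference rows
 * the other thread has completed. */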
2350 analysis.b_try_pskip = 0;
2351 if( h->param.analyse.b_fast_pskip )
2353 if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2354 // FIXME don't need to check this if the reference frame is done
2356 else if( h->param.analyse.i_subpel_refine >= 3 )
2357 analysis.b_try_pskip = 1;
2358 else if( h->mb.i_mb_type_left == P_SKIP ||
2359 h->mb.i_mb_type_top == P_SKIP ||
2360 h->mb.i_mb_type_topleft == P_SKIP ||
2361 h->mb.i_mb_type_topright == P_SKIP )
2362 b_skip = x264_macroblock_probe_pskip( h );
2365 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2369 h->mb.i_type = P_SKIP;
2370 h->mb.i_partition = D_16x16;
2371 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
2375 const unsigned int flags = h->param.analyse.inter;
2379 int i_satd_inter, i_satd_intra;
2381 if( x264_mb_analyse_load_costs( h, &analysis ) )
2384 x264_mb_analyse_inter_p16x16( h, &analysis );
2386 if( h->mb.i_type == P_SKIP )
2389 if( flags & X264_ANALYSE_PSUB16x16 )
2391 if( h->param.analyse.b_mixed_references )
2392 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2394 x264_mb_analyse_inter_p8x8( h, &analysis );
2397 /* Select best inter mode */
2399 i_partition = D_16x16;
2400 i_cost = analysis.l0.me16x16.cost;
2402 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2403 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2406 i_partition = D_8x8;
2407 i_cost = analysis.l0.i_cost8x8;
2410 if( flags & X264_ANALYSE_PSUB8x8 )
2412 for( i = 0; i < 4; i++ )
2414 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2415 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2417 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2418 h->mb.i_sub_partition[i] = D_L0_4x4;
2420 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2421 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2422 h->mb.i_sub_partition[i], D_L0_8x4 );
2424 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2425 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2426 h->mb.i_sub_partition[i], D_L0_4x8 );
2428 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2430 x264_mb_cache_mv_p8x8( h, &analysis, i );
2432 analysis.l0.i_cost8x8 = i_cost;
2436 /* Now do 16x8/8x16 */
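/* i_thresh16x8 estimates the MV bits a two-partition split would save relative to 8x8 (the MV cost
 * of two of the four 8x8 blocks); 16x8/8x16 are only searched when that optimistic estimate could
 * beat the 16x16 cost. */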
2437 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2438 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2439 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2441 x264_mb_analyse_inter_p16x8( h, &analysis );
2442 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2444 x264_mb_analyse_inter_p8x16( h, &analysis );
2445 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2448 h->mb.i_partition = i_partition;
2451 //FIXME mb_type costs?
2452 if( analysis.i_mbrd )
2456 else if( i_partition == D_16x16 )
2458 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2459 i_cost = analysis.l0.me16x16.cost;
2461 else if( i_partition == D_16x8 )
2463 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2464 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2465 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2467 else if( i_partition == D_8x16 )
2469 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2470 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2471 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2473 else if( i_partition == D_8x8 )
2477 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2479 switch( h->mb.i_sub_partition[i8x8] )
2482 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2483 i_cost += analysis.l0.me8x8[i8x8].cost;
2486 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2487 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2488 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2489 analysis.l0.me8x4[i8x8][1].cost;
2492 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2493 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2494 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2495 analysis.l0.me4x8[i8x8][1].cost;
2499 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2500 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2501 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2502 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2503 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2504 analysis.l0.me4x4[i8x8][1].cost +
2505 analysis.l0.me4x4[i8x8][2].cost +
2506 analysis.l0.me4x4[i8x8][3].cost;
2509 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2515 if( h->mb.b_chroma_me )
2517 x264_mb_analyse_intra_chroma( h, &analysis );
2518 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2519 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2520 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2521 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2524 x264_mb_analyse_intra( h, &analysis, i_cost );
2526 i_satd_inter = i_cost;
2527 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2528 analysis.i_satd_i8x8,
2529 analysis.i_satd_i4x4 );
2531 if( analysis.i_mbrd )
2533 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2535 i_partition = D_16x16;
2536 i_cost = analysis.l0.i_rd16x16;
2537 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2538 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2539 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2540 h->mb.i_type = i_type;
2541 h->mb.i_partition = i_partition;
2542 if( i_cost < COST_MAX )
2543 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2544 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2547 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2548 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2549 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2550 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2552 h->mb.i_type = i_type;
2554 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2556 if( IS_INTRA( h->mb.i_type ) )
2558 x264_intra_rd_refine( h, &analysis );
2560 else if( i_partition == D_16x16 )
2562 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2563 analysis.l0.me16x16.cost = analysis.l0.i_rd16x16;
2564 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2566 else if( i_partition == D_16x8 )
2568 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2569 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2570 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2571 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2572 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2573 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2575 else if( i_partition == D_8x16 )
2577 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2578 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2579 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2580 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2581 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2582 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2584 else if( i_partition == D_8x8 )
2587 x264_analyse_update_cache( h, &analysis );
2588 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2590 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2592 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2594 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2596 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2597 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2599 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2601 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2602 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2604 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2606 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2607 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2608 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2609 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2616 else if( h->sh.i_type == SLICE_TYPE_B )
2618 int i_bskip_cost = COST_MAX;
2621 if( analysis.i_mbrd )
2622 x264_mb_cache_fenc_satd( h );
2624 h->mb.i_type = B_SKIP;
2625 if( h->mb.b_direct_auto_write )
2627 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
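/* Both spatial and temporal direct prediction are probed for this MB; the per-frame skip counts
 * accumulated in i_direct_score are later used to choose the direct mode for upcoming frames. */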
2628 for( i = 0; i < 2; i++ )
2631 h->sh.b_direct_spatial_mv_pred ^= 1;
2632 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2633 if( analysis.b_direct_available )
2638 b_skip = x264_macroblock_probe_bskip( h );
2640 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2647 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2649 if( analysis.b_direct_available )
2651 if( !h->mb.b_direct_auto_write )
2653 if( analysis.i_mbrd )
2655 i_bskip_cost = ssd_mb( h );
2656 /* 6 = minimum cavlc cost of a non-skipped MB */
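/* i_lambda2 is pre-scaled by 256 (see x264_lambda2_tab), so (6 * lambda2 + 128) >> 8 converts a
 * 6-bit rate estimate into the SSD domain with rounding, for comparison against ssd_mb(). */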
2657 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2659 else if( !h->mb.b_direct_auto_write )
2661 /* Conditioning the probe on neighboring block types
2662 * doesn't seem to help speed or quality. */
2663 b_skip = x264_macroblock_probe_bskip( h );
2669 const unsigned int flags = h->param.analyse.inter;
2673 h->mb.b_skip_mc = 0;
2675 if( x264_mb_analyse_load_costs( h, &analysis ) )
2678 /* select best inter mode */
2679 /* direct must be first */
2680 if( analysis.b_direct_available )
2681 x264_mb_analyse_inter_direct( h, &analysis );
2683 x264_mb_analyse_inter_b16x16( h, &analysis );
2686 i_partition = D_16x16;
2687 i_cost = analysis.l0.me16x16.cost;
2688 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2689 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2690 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
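/* If direct is within ~3% of the best 16x16 mode, run the 16x16 RD checks now and bail out with
 * B_SKIP when the skip reconstruction already has a lower RD cost than every 16x16 candidate. */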
2692 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2694 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2695 if( i_bskip_cost < analysis.i_rd16x16direct &&
2696 i_bskip_cost < analysis.i_rd16x16bi &&
2697 i_bskip_cost < analysis.l0.i_rd16x16 &&
2698 i_bskip_cost < analysis.l1.i_rd16x16 )
2700 h->mb.i_type = B_SKIP;
2701 x264_analyse_update_cache( h, &analysis );
2706 if( flags & X264_ANALYSE_BSUB16x16 )
2708 x264_mb_analyse_inter_b8x8( h, &analysis );
2709 if( analysis.i_cost8x8bi < i_cost )
2712 i_partition = D_8x8;
2713 i_cost = analysis.i_cost8x8bi;
2715 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2716 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2718 x264_mb_analyse_inter_b16x8( h, &analysis );
2719 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2720 i_type, analysis.i_mb_type16x8,
2721 i_partition, D_16x8 );
2723 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2724 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2726 x264_mb_analyse_inter_b8x16( h, &analysis );
2727 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2728 i_type, analysis.i_mb_type8x16,
2729 i_partition, D_8x16 );
2734 if( analysis.i_mbrd )
2739 else if( i_partition == D_16x16 )
2741 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2742 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2743 if( i_type == B_L0_L0 )
2745 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2746 i_cost = analysis.l0.me16x16.cost
2747 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2749 else if( i_type == B_L1_L1 )
2751 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2752 i_cost = analysis.l1.me16x16.cost
2753 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2755 else if( i_type == B_BI_BI )
2757 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2758 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2761 else if( i_partition == D_16x8 )
2763 for( i=0; i<2; i++ )
2765 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2766 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2767 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2768 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2771 else if( i_partition == D_8x16 )
2773 for( i=0; i<2; i++ )
2775 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2776 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2777 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2778 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2781 else if( i_partition == D_8x8 )
2783 for( i=0; i<4; i++ )
2786 int i_part_cost_old;
2788 int i_part_type = h->mb.i_sub_partition[i];
2789 int b_bidir = (i_part_type == D_BI_8x8);
2791 if( i_part_type == D_DIRECT_8x8 )
2793 if( x264_mb_partition_listX_table[0][i_part_type] )
2795 m = &analysis.l0.me8x8[i];
2796 i_part_cost_old = m->cost;
2797 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2798 m->cost -= i_type_cost;
2799 x264_me_refine_qpel( h, m );
2801 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2803 if( x264_mb_partition_listX_table[1][i_part_type] )
2805 m = &analysis.l1.me8x8[i];
2806 i_part_cost_old = m->cost;
2807 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2808 m->cost -= i_type_cost;
2809 x264_me_refine_qpel( h, m );
2811 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2813 /* TODO: update mvp? */
2817 i_satd_inter = i_cost;
2819 if( analysis.i_mbrd )
2821 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2823 i_cost = i_bskip_cost;
2824 i_partition = D_16x16;
2825 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2826 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2827 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2828 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2829 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2830 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2831 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2833 h->mb.i_type = i_type;
2834 h->mb.i_partition = i_partition;
2837 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2839 if( analysis.i_mbrd )
2841 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2842 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2845 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2846 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2847 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2848 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2850 h->mb.i_type = i_type;
2851 h->mb.i_partition = i_partition;
2853 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2854 x264_intra_rd_refine( h, &analysis );
2855 if( h->mb.i_subpel_refine >= 5 )
2856 x264_refine_bidir( h, &analysis );
2858 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2860 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2861 x264_analyse_update_cache( h, &analysis );
2863 if( i_partition == D_16x16 )
2865 if( i_type == B_L0_L0 )
2867 analysis.l0.me16x16.cost = analysis.l0.i_rd16x16;
2868 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2870 else if( i_type == B_L1_L1 )
2872 analysis.l1.me16x16.cost = analysis.l1.i_rd16x16;
2873 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2875 else if( i_type == B_BI_BI )
2876 x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2878 else if( i_partition == D_16x8 )
2880 for( i = 0; i < 2; i++ )
2882 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2883 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2884 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2885 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2886 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2887 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2888 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2891 else if( i_partition == D_8x16 )
2893 for( i = 0; i < 2; i++ )
2895 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2896 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2897 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2898 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2899 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2900 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2901 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2904 else if( i_partition == D_8x8 )
2906 for( i = 0; i < 4; i++ )
2908 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2909 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2910 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2911 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2912 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2913 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
2920 x264_analyse_update_cache( h, &analysis );
2922 if( !analysis.i_mbrd )
2923 x264_mb_analyse_transform( h );
2925 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2926 x264_mb_analyse_qp_rd( h, &analysis );
2928 h->mb.b_trellis = h->param.analyse.i_trellis;
2929 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2930 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2931 x264_psy_trellis_init( h, 0 );
2932 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2933 h->mb.i_skip_intra = 0;
2937 /*-------------------- Update MB from the analysis ----------------------*/
2938 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
2942 switch( h->mb.i_type )
2945 for( i = 0; i < 16; i++ )
2946 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
2948 x264_mb_analyse_intra_chroma( h, a );
2951 for( i = 0; i < 4; i++ )
2952 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
2954 x264_mb_analyse_intra_chroma( h, a );
2957 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2958 x264_mb_analyse_intra_chroma( h, a );
2965 switch( h->mb.i_partition )
2968 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
2969 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
2973 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
2974 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
2975 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
2976 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
2980 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
2981 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
2982 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
2983 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
2987 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
2993 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2994 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2995 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2996 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2997 for( i = 0; i < 4; i++ )
2998 x264_mb_cache_mv_p8x8( h, a, i );
3003 h->mb.i_partition = D_16x16;
3004 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3005 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3011 x264_mb_load_mv_direct8x8( h, 0 );
3012 x264_mb_load_mv_direct8x8( h, 1 );
3013 x264_mb_load_mv_direct8x8( h, 2 );
3014 x264_mb_load_mv_direct8x8( h, 3 );
3018 /* optimize: cache might not need to be rewritten */
3019 for( i = 0; i < 4; i++ )
3020 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3023 default: /* the rest of the B types */
3024 switch( h->mb.i_partition )
3027 switch( h->mb.i_type )
3030 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3031 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3033 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3034 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3035 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3038 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3039 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3040 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3042 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3043 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3046 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3047 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3049 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3050 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3055 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3056 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3059 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3060 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3063 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
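/* With frame-parallel threading, every MV must stay within the reference rows the other threads
 * have already reconstructed (i_lines_completed); if analysis somehow produced an MV beyond that
 * limit, the macroblock is forced to intra as a recovery measure. */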
3069 if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
3072 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3075 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3078 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
3079 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3081 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3082 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3083 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3084 h->mb.cache.mv[l][x264_scan8[15]][0],
3085 h->mb.cache.mv[l][x264_scan8[15]][1] );
3086 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3087 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3088 fprintf(stderr, "completed: %d \n", completed );
3089 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3090 x264_mb_analyse_intra( h, a, COST_MAX );
3091 h->mb.i_type = I_16x16;
3092 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3093 x264_mb_analyse_intra_chroma( h, a );
3100 #include "slicetype.c"