1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "macroblock.h"
32 #include "ratecontrol.h"
/* Per-reference-list (L0/L1) motion analysis results for one macroblock.
 * NOTE(review): the opening of this typedef'd struct (and several fields)
 * was lost in extraction; the fields below are the visible tail of
 * x264_mb_analysis_list_t. */
41 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
45 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
46 ALIGNED_4( int16_t mvc[32][5][2] );
/* Sub-8x8 partition candidates: costs are accumulated per 8x8 block,
 * me arrays hold one x264_me_t per sub-partition. */
50 int i_cost4x4[4]; /* cost per 8x8 partition */
51 x264_me_t me4x4[4][4];
54 int i_cost8x4[4]; /* cost per 8x8 partition */
55 x264_me_t me8x4[4][2];
58 int i_cost4x8[4]; /* cost per 8x8 partition */
59 x264_me_t me4x8[4][2];
69 } x264_mb_analysis_list_t;
/* Top-level per-macroblock analysis context.
 * NOTE(review): the struct opening and a number of fields are missing from
 * this chunk; the lines below are the visible tail of x264_mb_analysis_t. */
73 /* conduct the analysis using this lambda and QP */
78 uint16_t *p_cost_ref[2];
83 /* Take some shortcuts in intra search if intra is deemed unlikely */
85 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
/* Per-direction intra SATD caches, reused by the RD refinement pass. */
90 int i_satd_i16x16_dir[7];
95 int i_satd_i8x8_dir[12][4];
104 int i_satd_i8x8chroma;
105 int i_satd_i8x8chroma_dir[7];
106 int i_predict8x8chroma;
108 /* II: Inter part P/B frame */
109 x264_mb_analysis_list_t l0;
110 x264_mb_analysis_list_t l1;
112 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
113 int i_cost16x16direct;
115 int i_cost8x8direct[4];
116 int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
117 int i_cost_est16x8[2]; /* Per-partition estimated cost */
118 int i_cost_est8x16[2];
127 int i_mb_partition16x8[2]; /* mb_partition_e */
128 int i_mb_partition8x16[2];
129 int i_mb_type16x8; /* mb_class_e */
132 int b_direct_available;
134 } x264_mb_analysis_t;
136 /* lambda = pow(2,qp/6-2) */
/* SATD-domain Lagrange multiplier, indexed by QP 0..51.
 * NOTE(review): the closing "};" of this table was lost in extraction. */
137 const uint8_t x264_lambda_tab[52] = {
138 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
139 1, 1, 1, 1, /* 8-11 */
140 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
141 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
142 6, 7, 8, 9,10,11,13,14, /* 28-35 */
143 16,18,20,23,25,29,32,36, /* 36-43 */
144 40,45,51,57,64,72,81,91 /* 44-51 */
147 /* lambda2 = pow(lambda,2) * .9 * 256 */
/* SSD-domain lambda for RD mode decision, fixed-point scaled by 256 and
 * indexed by QP 0..51.  NOTE(review): closing "};" lost in extraction. */
148 const int x264_lambda2_tab[52] = {
149 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
150 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
151 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
152 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
153 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
154 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
155 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
/* 64-entry lookup for the fractional part of an exp2 approximation
 * (presumably (2^(i/64)-1)*256 rounded — TODO confirm against callers).
 * NOTE(review): closing "};" lost in extraction. */
158 const uint8_t x264_exp2_lut[64] = {
159 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
160 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
161 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
162 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
/* 128-entry table of log2(1 + i/128) for a fast log2 approximation.
 * NOTE(review): closing "};" lost in extraction. */
165 const float x264_log2_lut[128] = {
166 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
167 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
168 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
169 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
170 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
171 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
172 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
173 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
174 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
175 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
176 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
177 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
178 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
179 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
180 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
181 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
184 /* Avoid an int/float conversion. */
/* Maps a leading-zero count (0..31) to the integer part of log2 as float;
 * stored pre-converted so callers skip an int->float cast.
 * NOTE(review): closing "};" lost in extraction. */
185 const float x264_log2_lz_lut[32] = {
186 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
189 // should the intra and inter lambdas be different?
190 // I'm just matching the behaviour of deadzone quant.
/* Trellis quantization lambda2, [0]=inter / [1]=intra, indexed by QP.
 * NOTE(review): closing "};" lost in extraction. */
191 static const int x264_trellis_lambda2_tab[2][52] = {
192 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
193 { 46, 58, 73, 92, 117, 147,
194 185, 233, 294, 370, 466, 587,
195 740, 932, 1174, 1480, 1864, 2349,
196 2959, 3728, 4697, 5918, 7457, 9395,
197 11837, 14914, 18790, 23674, 29828, 37581,
198 47349, 59656, 75163, 94699, 119313, 150326,
199 189399, 238627, 300652, 378798, 477255, 601304,
200 757596, 954511, 1202608, 1515192, 1909022, 2405217,
201 3030384, 3818045, 4810435, 6060769 },
202 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
203 { 27, 34, 43, 54, 68, 86,
204 108, 136, 172, 216, 273, 343,
205 433, 545, 687, 865, 1090, 1374,
206 1731, 2180, 2747, 3461, 4361, 5494,
207 6922, 8721, 10988, 13844, 17442, 21976,
208 27688, 34885, 43953, 55377, 69771, 87906,
209 110755, 139543, 175813, 221511, 279087, 351627,
210 443023, 558174, 703255, 886046, 1116348, 1406511,
211 1772093, 2232697, 2813022, 3544186 }
/* Chroma lambda2 scale factors, indexed by (luma_qp - chroma_qp + 12)
 * in x264_mb_analyse_init_qp(); 256 == unity scale.
 * NOTE(review): the tail of this table and its closing "};" were lost in
 * extraction. */
214 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
215 16, 20, 25, 32, 40, 50,
216 64, 80, 101, 128, 161, 203,
217 256, 322, 406, 512, 645, 812,
218 1024, 1290, 1625, 2048, 2580, 3250,
219 4096, 5160, 6501, 8192, 10321, 13003,
220 16384, 20642, 26007, 32768, 41285, 52015,
224 /* TODO: calculate CABAC costs */
/* CAVLC bit-cost (in bits) of coding each mb/sub-mb type; multiplied by
 * lambda when added to SATD scores.  NOTE(review): the initializer of
 * i_sub_mb_p_cost_table and the tables' closing "};" lines were lost in
 * extraction. */
225 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
226 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
228 static const uint8_t i_mb_b16x8_cost_table[17] = {
229 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
231 static const uint8_t i_sub_mb_b_cost_table[13] = {
232 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
234 static const uint8_t i_sub_mb_p_cost_table[4] = {
238 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
/* Shared ref-index bit-cost cache [lambda][num_refs-1 clipped 0..2][ref];
 * written under cost_ref_mutex since multiple encoder threads may call
 * x264_analyse_init_costs() concurrently. */
240 static uint16_t x264_cost_ref[92][3][33];
241 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* Allocate and fill the per-lambda MV and ref-index bit-cost tables for the
 * given QP; returns early if the tables for this lambda already exist.
 * NOTE(review): several lines of this function (braces, the early return,
 * the success return and the CHECKED_MALLOC fail label) were lost in
 * extraction — code below is only the visible portion. */
243 int x264_analyse_init_costs( x264_t *h, int qp )
245 int lambda = x264_lambda_tab[qp];
246 if( h->cost_mv[lambda] )
248 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
249 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
/* Re-base the pointer to the middle so it can be indexed by signed mv. */
250 h->cost_mv[lambda] += 2*4*2048;
251 for( int i = 0; i <= 2*4*2048; i++ )
253 h->cost_mv[lambda][-i] =
254 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
/* x264_cost_ref is global state shared by all encoder threads. */
256 x264_pthread_mutex_lock( &cost_ref_mutex );
257 for( int i = 0; i < 3; i++ )
258 for( int j = 0; j < 33; j++ )
259 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
260 x264_pthread_mutex_unlock( &cost_ref_mutex );
/* Full-pel cost tables (one per qpel phase) are only needed by ESA/TESA. */
261 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
263 for( int j = 0; j < 4; j++ )
265 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
266 h->cost_mv_fpel[lambda][j] += 2*2048;
267 for( int i = -2*2048; i < 2*2048; i++ )
268 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
/* Free every per-lambda cost table, un-rebasing the pointers that
 * x264_analyse_init_costs() offset to their midpoints.
 * NOTE(review): braces and the guard around h->cost_mv[i] were lost in
 * extraction — code below is only the visible portion. */
276 void x264_analyse_free_costs( x264_t *h )
278 for( int i = 0; i < 92; i++ )
281 x264_free( h->cost_mv[i] - 2*4*2048 );
282 if( h->cost_mv_fpel[i][0] )
283 for( int j = 0; j < 4; j++ )
284 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
/* Incrementally apply weighted prediction to L0 reference planes, up to
 * row `end` (lazily, tracked via h->fenc->i_lines_weighted).
 * NOTE(review): braces and the declarations of `height`/`offset` were lost
 * in extraction — code below is only the visible portion. */
288 void x264_analyse_weight_frame( x264_t *h, int end )
290 for( int j = 0; j < h->i_ref0; j++ )
292 if( h->sh.weight[j][0].weightfn )
294 x264_frame_t *frame = h->fref0[j];
295 int width = frame->i_width[0] + 2*PADH;
296 int i_padv = PADV << h->param.b_interlaced;
298 pixel *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
/* Only weight the rows not yet processed on a previous call. */
299 height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
300 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
301 h->fenc->i_lines_weighted += height;
/* All later refs sharing the same source plane are weighted together. */
303 for( int k = j; k < h->i_ref0; k++ )
304 if( h->sh.weight[k][0].weightfn )
306 pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
307 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
308 src + offset, frame->i_stride[0],
309 width, height, &h->sh.weight[k][0] );
316 /* initialize an array of lambda*nbits for all possible mvs */
/* Point the analysis context at the precomputed mv/ref cost tables for the
 * current lambda.  NOTE(review): braces lost in extraction. */
317 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
319 a->p_cost_mv = h->cost_mv[a->i_lambda];
320 a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
321 a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* Set the QP-dependent fields of the analysis context and macroblock state:
 * luma/chroma QP, lambdas, trellis lambdas and the psy chroma lambda offset.
 * NOTE(review): braces lost in extraction. */
324 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
326 /* conduct the analysis using this lambda and QP */
327 a->i_qp = h->mb.i_qp = i_qp;
328 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
330 a->i_lambda = x264_lambda_tab[i_qp];
331 a->i_lambda2 = x264_lambda2_tab[i_qp];
/* trellis=1 is only used inside RD refinement (mbrd), trellis=2 always. */
333 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
334 if( h->param.analyse.i_trellis )
336 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
337 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
338 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
339 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
341 h->mb.i_psy_rd_lambda = a->i_lambda;
342 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
343 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
/* Per-macroblock analysis setup: decide RD level, reset costs, compute the
 * legal MV ranges (clipped for frame edges, thread sync and Periodic Intra
 * Refresh), and decide fast-intra / forced-intra flags.
 * NOTE(review): many lines (braces, several field resets, else branches)
 * were lost in extraction — code below is only the visible portion. */
346 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
348 int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
350 /* mbrd == 1 -> RD mode decision */
351 /* mbrd == 2 -> RD refinement */
352 /* mbrd == 3 -> QPRD */
353 a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
355 x264_mb_analyse_init_qp( h, a, i_qp );
357 h->mb.b_transform_8x8 = 0;
358 h->mb.b_noise_reduction = 0;
364 a->i_satd_i8x8chroma = COST_MAX;
366 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
367 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
371 h->mb.b_lossless ? 0 :
373 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
375 /* II: Inter part P/B frame */
376 if( h->sh.i_type != SLICE_TYPE_I )
378 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
379 // limit motion search to a slightly smaller range than the theoretical limit,
380 // since the search may go a few iterations past its given range
381 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
383 /* Calculate max allowed MV range */
384 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
385 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
386 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
387 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
388 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
389 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
391 int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
392 int max_mv = max_x - 4*16*h->mb.i_mb_x;
393 /* If we're left of the refresh bar, don't reference right of it. */
394 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
395 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
397 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
398 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
/* Vertical range only changes per row, so recompute at the row start. */
399 if( h->mb.i_mb_x == 0 )
401 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
402 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
403 int thread_mvy_range = i_fmv_range;
405 if( h->i_thread_frames > 1 )
407 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
408 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
/* Wait until every reference frame thread has completed enough rows. */
409 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
411 x264_frame_t **fref = i ? h->fref1 : h->fref0;
412 int i_ref = i ? h->i_ref1 : h->i_ref0;
413 for( int j = 0; j < i_ref; j++ )
415 x264_frame_cond_wait( fref[j]->orig, thresh );
416 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
420 if( h->param.b_deterministic )
421 thread_mvy_range = h->param.analyse.i_mv_range_thread;
422 if( h->mb.b_interlaced )
423 thread_mvy_range >>= 1;
425 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
428 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
429 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
430 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
431 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
432 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
433 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
434 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
/* Reset inter cost accumulators for this macroblock. */
442 a->l0.i_cost8x16 = COST_MAX;
443 if( h->sh.i_type == SLICE_TYPE_B )
448 a->i_cost8x8direct[0] =
449 a->i_cost8x8direct[1] =
450 a->i_cost8x8direct[2] =
451 a->i_cost8x8direct[3] =
460 a->i_cost16x16direct =
463 a->i_cost8x16bi = COST_MAX;
465 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
466 for( int i = 0; i < 4; i++ )
470 a->l0.i_cost4x8[i] = COST_MAX;
473 /* Fast intra decision */
474 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
476 /* Always run in fast-intra mode for subme < 3 */
477 if( h->mb.i_subpel_refine > 2 &&
478 ( IS_INTRA( h->mb.i_mb_type_left ) ||
479 IS_INTRA( h->mb.i_mb_type_top ) ||
480 IS_INTRA( h->mb.i_mb_type_topleft ) ||
481 IS_INTRA( h->mb.i_mb_type_topright ) ||
482 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] )) ||
483 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
484 { /* intra is likely */ }
/* Periodic Intra Refresh: force intra inside the refresh column band. */
491 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
492 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
494 a->b_force_intra = 1;
498 a->b_force_intra = 0;
502 /* Prediction modes allowed for various combinations of neighbors. */
503 /* Terminated by a -1. */
504 /* In order, no neighbors, left, top, top/left, top/left/topleft */
/* NOTE(review): the "{" after each declarator and the closing "};" lines
 * were lost in extraction. */
505 static const int8_t i16x16_mode_available[5][5] =
507 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
508 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
509 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
510 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
511 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
514 static const int8_t i8x8chroma_mode_available[5][5] =
516 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
517 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
518 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
519 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
520 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
523 static const int8_t i4x4_mode_available[5][10] =
525 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
526 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
527 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
528 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
529 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
/* Select the allowed intra mode list for the current neighbour mask.
 * Row index: bits MB_LEFT|MB_TOP form 0..3; any MB_TOPLEFT forces row 4
 * (all neighbours).  NOTE(review): braces lost in extraction. */
532 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
534 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
535 return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
538 static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
540 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
541 return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
544 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
546 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
547 return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
550 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
/* Precompute the DCT of the source block against a zero prediction, used
 * as the psy target during trellis quantization.
 * NOTE(review): braces lost in extraction. */
551 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
553 ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
555 if( do_both_dct || h->mb.b_transform_8x8 )
556 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
557 if( do_both_dct || !h->mb.b_transform_8x8 )
558 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
561 /* Reset fenc satd scores cache for psy RD */
/* NOTE(review): braces and the early-return/guard structure around b_satd
 * were lost in extraction — code below is only the visible portion. */
562 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
564 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
565 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
566 if( !h->mb.i_psy_rd )
568 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
569 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
571 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
/* Choose the best 8x8 chroma intra prediction mode by SATD + mode bits,
 * storing per-mode costs in a->i_satd_i8x8chroma_dir and the winner in
 * a->i_predict8x8chroma / h->mb.i_chroma_pred_mode.
 * NOTE(review): braces, the early return when already computed, and the
 * else branch pairing were lost in extraction. */
574 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
576 if( a->i_satd_i8x8chroma < COST_MAX )
579 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
581 /* 8x8 prediction selection for chroma */
582 if( predict_mode[3] >= 0 && !h->mb.b_lossless )
/* Fast path: all of V/H/DC available — use the x3 SIMD costing, then
 * evaluate planar separately since it has no x3 kernel. */
584 int satdu[4], satdv[4];
585 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
586 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
587 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
588 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
589 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
590 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
592 for( ; *predict_mode >= 0; predict_mode++ )
594 int i_mode = *predict_mode;
595 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
597 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
598 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
/* Slow path: predict and cost each available mode individually. */
603 for( ; *predict_mode >= 0; predict_mode++ )
606 int i_mode = *predict_mode;
608 /* we do the prediction */
609 if( h->mb.b_lossless )
610 x264_predict_lossless_8x8_chroma( h, i_mode );
613 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
614 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
617 /* we calculate the cost */
618 i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
619 h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
620 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
622 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
623 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
627 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* Luma intra analysis: evaluate I16x16, I8x8 and I4x4 prediction by
 * SATD/SA8D + mode bits, with early termination against i_satd_inter and
 * per-subme thresholds.  Results land in a->i_satd_i16x16 / i_satd_i8x8 /
 * i_satd_i4x4 and the corresponding i_predict* fields.
 * NOTE(review): numerous lines (braces, else keywords, some declarations
 * and early exits) were lost in extraction — code below is only the
 * visible portion. */
630 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
632 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
633 pixel *p_src = h->mb.pic.p_fenc[0];
634 pixel *p_dst = h->mb.pic.p_fdec[0];
/* [has DDR-family][favor_vertical] -> reduced candidate list for the
 * direction-based analysis shortcut below. */
635 static const int8_t intra_analysis_shortcut[2][2][5] =
636 {{{I_PRED_4x4_HU, -1},
637 {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1}},
638 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1},
639 {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}};
642 int lambda = a->i_lambda;
644 /*---------------- Try all mode and calculate their score ---------------*/
646 /* 16x16 prediction selection */
647 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
649 /* Not heavily tuned */
650 static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
651 int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
653 if( !h->mb.b_lossless && predict_mode[3] >= 0 )
/* Fast path: V/H/DC all available — SIMD x3 costing. */
655 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
656 a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
657 a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
658 a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
659 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
660 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
661 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
663 /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
664 if( a->i_satd_i16x16 <= i16x16_thresh )
666 h->predict_16x16[I_PRED_16x16_P]( p_dst );
667 a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
668 a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
669 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
/* Slow path: predict and cost each available mode individually. */
674 for( ; *predict_mode >= 0; predict_mode++ )
677 int i_mode = *predict_mode;
679 if( h->mb.b_lossless )
680 x264_predict_lossless_16x16( h, i_mode );
682 h->predict_16x16[i_mode]( p_dst );
684 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
685 lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
686 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
687 a->i_satd_i16x16_dir[i_mode] = i_satd;
691 if( h->sh.i_type == SLICE_TYPE_B )
692 /* cavlc mb type prefix */
693 a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
695 if( a->i_satd_i16x16 > i16x16_thresh )
698 /* 8x8 prediction selection */
699 if( flags & X264_ANALYSE_I8x8 )
701 ALIGNED_ARRAY_16( pixel, edge,[33] );
702 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
703 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
705 // FIXME some bias like in i4x4?
706 int i_cost = lambda * 4; /* base predmode costs */
707 h->mb.i_cbp_luma = 0;
709 if( h->sh.i_type == SLICE_TYPE_B )
710 i_cost += lambda * i_mb_b_cost_table[I_8x8];
712 for( idx = 0;; idx++ )
716 pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
717 pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
718 int i_best = COST_MAX;
719 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
721 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
722 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
724 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
727 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
728 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
/* Bias toward the predicted (cheaper-to-code) mode. */
729 satd[i_pred_mode] -= 3 * lambda;
730 for( int i = 2; i >= 0; i-- )
733 a->i_satd_i8x8_dir[i][idx] = cost + 4 * lambda;
734 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
737 /* Take analysis shortcuts: don't analyse modes that are too
738 * far away direction-wise from the favored mode. */
739 if( a->i_mbrd < 1 + a->b_fast_intra )
740 predict_mode = intra_analysis_shortcut[predict_mode[8] >= 0][favor_vertical];
745 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
748 int i_mode = *predict_mode;
750 if( h->mb.b_lossless )
751 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
753 h->predict_8x8[i_mode]( p_dst_by, edge );
755 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
756 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
757 i_satd -= 3 * lambda;
759 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
760 a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * lambda;
762 i_cost += i_best + 3 * lambda;
764 if( idx == 3 || i_cost > i_satd_thresh )
767 /* we need to encode this block now (for next ones) */
768 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
769 x264_mb_encode_i8x8( h, idx, a->i_qp );
771 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
776 a->i_satd_i8x8 = i_cost;
777 if( h->mb.i_skip_intra )
/* Snapshot the decoded pixels/NNZ/DCT so a later winner can restore them. */
779 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
780 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
781 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
782 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
783 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
784 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
785 if( h->mb.i_skip_intra == 2 )
786 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
/* Early termination: extrapolate full-MB cost from the blocks analysed. */
791 static const uint16_t cost_div_fix8[3] = {1024,512,341};
792 a->i_satd_i8x8 = COST_MAX;
793 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
795 /* Not heavily tuned */
796 static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
797 if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
801 /* 4x4 prediction selection */
802 if( flags & X264_ANALYSE_I4x4 )
804 int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
805 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
806 h->mb.i_cbp_luma = 0;
809 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
811 if( h->sh.i_type == SLICE_TYPE_B )
812 i_cost += lambda * i_mb_b_cost_table[I_4x4];
814 for( idx = 0;; idx++ )
816 pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
817 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
818 int i_best = COST_MAX;
819 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
821 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
823 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
824 /* emulate missing topright samples */
825 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
827 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
830 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
831 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
832 satd[i_pred_mode] -= 3 * lambda;
833 for( int i = 2; i >= 0; i-- )
834 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
836 /* Take analysis shortcuts: don't analyse modes that are too
837 * far away direction-wise from the favored mode. */
838 if( a->i_mbrd < 1 + a->b_fast_intra )
839 predict_mode = intra_analysis_shortcut[predict_mode[8] >= 0][favor_vertical];
846 for( ; *predict_mode >= 0; predict_mode++ )
849 int i_mode = *predict_mode;
851 if( h->mb.b_lossless )
852 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
854 h->predict_4x4[i_mode]( p_dst_by );
856 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
857 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
859 i_satd -= lambda * 3;
863 a->i_predict4x4[idx] = i_mode;
868 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
871 i_cost += i_best + 3 * lambda;
873 if( i_cost > i_satd_thresh || idx == 15 )
876 /* we need to encode this block now (for next ones) */
877 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
878 x264_mb_encode_i4x4( h, idx, a->i_qp );
880 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
884 a->i_satd_i4x4 = i_cost;
885 if( h->mb.i_skip_intra )
887 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
888 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
889 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
890 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
891 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
892 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
893 if( h->mb.i_skip_intra == 2 )
894 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
898 a->i_satd_i4x4 = COST_MAX;
/* Re-score the SATD-selected intra modes with true RD cost, skipping any
 * whose SATD already exceeds i_satd_thresh (those are set to COST_MAX so
 * they cannot win the final comparison).
 * NOTE(review): braces and the else keywords pairing the COST_MAX
 * assignments were lost in extraction. */
902 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
904 if( a->i_satd_i16x16 <= i_satd_thresh )
906 h->mb.i_type = I_16x16;
907 x264_analyse_update_cache( h, a );
908 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
911 a->i_satd_i16x16 = COST_MAX;
913 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
915 h->mb.i_type = I_4x4;
916 x264_analyse_update_cache( h, a );
917 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
920 a->i_satd_i4x4 = COST_MAX;
922 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
924 h->mb.i_type = I_8x8;
925 x264_analyse_update_cache( h, a );
926 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
927 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
930 a->i_satd_i8x8 = COST_MAX;
/* Refine the intra prediction modes already chosen by SATD-based analysis,
 * re-scoring candidate modes with full RD cost (x264_rd_cost_*).  Only modes
 * whose SATD estimate lies within a small multiplicative threshold of the
 * current winner are re-evaluated, bounding the extra encode work.
 * NOTE(review): this extract has lost brace/blank lines (embedded original
 * line numbers are non-contiguous); code text left byte-identical. */
933 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
935 pixel *p_dst = h->mb.pic.p_fdec[0];
936 uint64_t i_satd, i_best;
937 h->mb.i_skip_intra = 0;
/* I_16x16: retry every available 16x16 mode whose SATD is within 9/8 of the
 * originally selected mode's SATD; keep the one with lowest RD cost. */
939 if( h->mb.i_type == I_16x16 )
941 int old_pred_mode = a->i_predict16x16;
942 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
943 int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
944 i_best = a->i_satd_i16x16;
945 for( ; *predict_mode >= 0; predict_mode++ )
947 int i_mode = *predict_mode;
/* skip the mode we already scored and modes well above the SATD threshold */
948 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
950 h->mb.i_intra16x16_pred_mode = i_mode;
951 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
952 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
956 /* RD selection for chroma prediction */
957 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
958 if( predict_mode[1] >= 0 )
960 int8_t predict_mode_sorted[4];
962 int i_thresh = a->i_satd_i8x8chroma * 5/4;
/* collect the chroma modes (other than the current one) close enough in
 * SATD to be worth an RD re-check */
964 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
966 int i_mode = *predict_mode;
967 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
968 predict_mode_sorted[i_max++] = i_mode;
973 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
974 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
975 /* the previous thing encoded was x264_intra_rd(), so the pixels and
976 * coefs for the current chroma mode are still around, so we only
977 * have to recount the bits. */
978 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
979 for( int i = 0; i < i_max; i++ )
981 int i_mode = predict_mode_sorted[i];
982 if( h->mb.b_lossless )
983 x264_predict_lossless_8x8_chroma( h, i_mode );
986 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
987 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
989 /* if we've already found a mode that needs no residual, then
990 * probably any mode with a residual will be worse.
991 * so avoid dct on the remaining modes to improve speed. */
992 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
993 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
995 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
996 h->mb.i_cbp_chroma = i_cbp_chroma_best;
/* I_4x4: per-block RD refinement.  Because encoding a candidate mode
 * overwrites the reconstructed pixels and nnz cache that later blocks
 * predict from, the best candidate's pixels/nnz are snapshotted and
 * restored after the mode loop. */
1000 if( h->mb.i_type == I_4x4 )
1002 pixel4 pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1004 for( int idx = 0; idx < 16; idx++ )
1006 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1007 i_best = COST_MAX64;
1009 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
1011 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1012 /* emulate missing topright samples */
1013 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
1015 for( ; *predict_mode >= 0; predict_mode++ )
1017 int i_mode = *predict_mode;
1018 if( h->mb.b_lossless )
1019 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1021 h->predict_4x4[i_mode]( p_dst_by );
1022 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1024 if( i_best > i_satd )
1026 a->i_predict4x4[idx] = i_mode;
/* snapshot the winning mode's reconstructed 4x4 block and nnz */
1028 pels[0] = MPIXEL_X4( p_dst_by+0*FDEC_STRIDE );
1029 pels[1] = MPIXEL_X4( p_dst_by+1*FDEC_STRIDE );
1030 pels[2] = MPIXEL_X4( p_dst_by+2*FDEC_STRIDE );
1031 pels[3] = MPIXEL_X4( p_dst_by+3*FDEC_STRIDE );
1032 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
/* restore the winner so subsequent blocks predict from correct pixels */
1036 MPIXEL_X4( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1037 MPIXEL_X4( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1038 MPIXEL_X4( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1039 MPIXEL_X4( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1040 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1042 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
/* I_8x8: same save/restore scheme as I_4x4, but only the bottom row and
 * right column of each 8x8 block are needed for neighbour prediction. */
1045 else if( h->mb.i_type == I_8x8 )
1047 ALIGNED_ARRAY_16( pixel, edge,[33] );
1048 for( int idx = 0; idx < 4; idx++ )
1050 pixel4 pels_h[2] = {0};
1051 pixel pels_v[7] = {0};
1052 uint16_t i_nnz[2] = {0}; //shut up gcc
1054 int cbp_luma_new = 0;
1055 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1057 i_best = COST_MAX64;
1060 int s8 = X264_SCAN8_0 + 2*x + 16*y;
1062 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1063 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1064 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1066 for( ; *predict_mode >= 0; predict_mode++ )
1068 int i_mode = *predict_mode;
1069 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1072 if( h->mb.b_lossless )
1073 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1075 h->predict_8x8[i_mode]( p_dst_by, edge );
/* reset cbp to the pre-refinement value so each candidate is costed
 * from the same starting state */
1076 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1077 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1079 if( i_best > i_satd )
1081 a->i_predict8x8[idx] = i_mode;
1082 cbp_luma_new = h->mb.i_cbp_luma;
1085 pels_h[0] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 );
1086 pels_h[1] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 );
1088 for( int j = 0; j < 7; j++ )
1089 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1090 i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
1091 i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
1094 a->i_cbp_i8x8_luma = cbp_luma_new;
1095 MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 ) = pels_h[0];
1096 MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 ) = pels_h[1];
1098 for( int j = 0; j < 7; j++ )
1099 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1100 M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
1101 M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
1103 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* Set up an x264_me_t with the source-frame (encode-side) plane pointers,
 * strides, and MV cost table for the partition at pixel offset (xoff,yoff).
 * Chroma offsets are halved — assumes 4:2:0 subsampling (consistent with the
 * >>1 scaling used throughout this file).  Expects `a` and `h` in scope. */
1108 #define LOAD_FENC( m, src, xoff, yoff) \
1109 (m)->p_cost_mv = a->p_cost_mv; \
1110 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1111 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1112 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1113 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1114 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* Point an x264_me_t at the half-pel reference planes [0..3] (luma full/half
 * pel), planes [4..5] (chroma, halved offsets), and the integral image for
 * the given list/ref, at pixel offset (xoff,yoff).  Also resets the weight
 * to weight_none; LOAD_WPELS overrides it afterwards when weighting applies. */
1116 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1117 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1118 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1119 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1120 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1121 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1122 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1123 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1124 (m)->weight = weight_none; \
/* Switch an x264_me_t to the weighted reference plane and per-slice weights.
 * NOTE(review): the weight lookup uses `i_ref` captured from the caller's
 * scope rather than the `ref` macro argument — all call sites in this file
 * pass a variable literally named i_ref, so it works, but it is fragile. */
1127 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1128 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1129 (m)->weight = h->sh.weight[i_ref];
/* Bit cost (lambda-scaled) of coding reference index `ref` in list `list`,
 * from the precomputed table in the analysis context. */
1131 #define REF_COST(list, ref) \
1132 (a->p_cost_ref[list][ref])
/* Motion-estimate the whole macroblock as a single 16x16 L0 partition,
 * trying every reference frame and keeping the cheapest in a->l0.me16x16.
 * May early-out by setting h->mb.i_type = P_SKIP when the best MV is
 * (nearly) the predicted skip MV and the skip probe succeeds. */
1134 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1138 ALIGNED_4( int16_t mvc[8][2] );
/* halfpel threshold only matters with >1 reference; it is biased by the
 * per-ref cost below so refs are compared fairly */
1139 int i_halfpel_thresh = INT_MAX;
1140 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1142 /* 16x16 Search on all ref frame */
1143 m.i_pixel = PIXEL_16x16;
1144 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1146 a->l0.me16x16.cost = INT_MAX;
1147 for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1149 m.i_ref_cost = REF_COST( 0, i_ref );
1150 i_halfpel_thresh -= m.i_ref_cost;
1152 /* search with ref */
1153 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1154 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1156 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
/* a blind dupe ref is pixel-identical to ref0, so reuse ref0's MV and
 * just do subpel refinement instead of a full search */
1158 if( h->mb.ref_blind_dupe == i_ref )
1160 CP32( m.mv, a->l0.mvc[0][0] );
1161 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1165 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1166 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1169 /* save mv for predicting neighbors */
1170 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1171 CP32( a->l0.mvc[i_ref][0], m.mv );
1173 /* early termination
1174 * SSD threshold would probably be better than SATD */
1177 && m.cost-m.cost_mv < 300*a->i_lambda
1178 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1179 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1180 && x264_macroblock_probe_pskip( h ) )
1182 h->mb.i_type = P_SKIP;
1183 x264_analyse_update_cache( h, a );
1184 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1188 m.cost += m.i_ref_cost;
1189 i_halfpel_thresh += m.i_ref_cost;
1191 if( m.cost < a->l0.me16x16.cost )
1192 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1195 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1196 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1198 h->mb.i_type = P_L0;
/* in RD mode, if the winner looks exactly like a skip, score it and
 * promote to P_SKIP when it produces no residual */
1201 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1202 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1204 h->mb.i_partition = D_16x16;
1205 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1206 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1207 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1208 h->mb.i_type = P_SKIP;
/* P 8x8 analysis with per-partition reference selection (mixed refs):
 * each of the four 8x8 blocks searches every candidate reference frame
 * and keeps its own best ref/MV.  Fills a->l0.me8x8[], a->i_satd8x8[0][],
 * a->l0.i_cost8x8 and sets all sub-partitions to D_L0_8x8. */
1213 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1216 pixel **p_fenc = h->mb.pic.p_fenc;
1217 int i_maxref = h->mb.pic.i_fref[0]-1;
1219 h->mb.i_partition = D_8x8;
1221 #define CHECK_NEIGHBOUR(i)\
1223 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1224 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1228 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1229 * than those used by the neighbors */
1230 if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1231 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
/* scan the cached refs of the top and left neighbours to bound i_maxref */
1234 CHECK_NEIGHBOUR( -8 - 1 );
1235 CHECK_NEIGHBOUR( -8 + 0 );
1236 CHECK_NEIGHBOUR( -8 + 2 );
1237 CHECK_NEIGHBOUR( -8 + 4 );
1238 CHECK_NEIGHBOUR( 0 - 1 );
1239 CHECK_NEIGHBOUR( 2*8 - 1 );
1241 #undef CHECK_NEIGHBOUR
/* seed each ref's MV candidate list with the 16x16 MV saved in mvr */
1243 for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1244 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1246 for( int i = 0; i < 4; i++ )
1248 x264_me_t *l0m = &a->l0.me8x8[i];
1252 m.i_pixel = PIXEL_8x8;
1254 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1255 l0m->cost = INT_MAX;
/* loop over refs; the blind-dupe ref may lie past i_maxref and is
 * appended explicitly at the bottom of the loop */
1256 for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1258 m.i_ref_cost = REF_COST( 0, i_ref );
1260 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1261 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1263 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1264 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1265 if( h->mb.ref_blind_dupe == i_ref )
1267 CP32( m.mv, a->l0.mvc[0][i+1] );
1268 x264_me_refine_qpel_refdupe( h, &m, NULL );
1271 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1273 m.cost += m.i_ref_cost;
1275 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1277 if( m.cost < l0m->cost )
1278 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1279 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1280 i_ref = h->mb.ref_blind_dupe;
1284 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1285 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
/* store the raw distortion (cost minus MV and ref bits) for later reuse */
1287 a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1289 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1290 are effectively zero. */
1291 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1292 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1295 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1296 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1297 /* P_8x8 ref0 has no ref cost */
1298 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1299 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1300 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1301 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1302 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 8x8 analysis with a single shared reference frame (the one chosen by
 * the 16x16 search).  Cheaper than the mixed-ref variant: all four 8x8
 * partitions search the same ref, seeded by the 16x16 MV.  Fills
 * a->l0.me8x8[], a->i_satd8x8[0][] and a->l0.i_cost8x8. */
1305 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1307 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1308 * reference frame flags. Thus, if we're not doing mixedrefs, just
1309 * don't bother analysing the dupes. */
1310 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
/* with CAVLC, ref0 on all partitions costs no ref bits at all */
1311 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1312 pixel **p_fenc = h->mb.pic.p_fenc;
1314 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1316 /* XXX Needed for x264_mb_predict_mv */
1317 h->mb.i_partition = D_8x8;
/* candidate list starts with the 16x16 MV; each partition appends its own */
1320 CP32( mvc[0], a->l0.me16x16.mv );
1322 for( int i = 0; i < 4; i++ )
1324 x264_me_t *m = &a->l0.me8x8[i];
1328 m->i_pixel = PIXEL_8x8;
1329 m->i_ref_cost = i_ref_cost;
1331 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1332 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1333 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1335 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1336 x264_me_search( h, m, mvc, i_mvc );
1338 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1340 CP32( mvc[i_mvc], m->mv );
/* raw distortion without MV bits, reused by 16x8/8x16 estimation */
1343 a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1346 m->cost += i_ref_cost;
1347 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1348 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1351 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1352 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1353 /* theoretically this should include 4*ref_cost,
1354 * but 3 seems a better approximation of cabac. */
1355 if( h->param.b_cabac )
1356 a->l0.i_cost8x8 -= i_ref_cost;
1357 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1358 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 16x8 analysis: two horizontal halves.  Each half only tries the refs
 * chosen by its two underlying 8x8 partitions (at most 2 distinct refs),
 * seeded by their MVs.  i_best_satd enables early termination when the
 * partial cost already exceeds the best whole-MB cost so far. */
1361 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1364 pixel **p_fenc = h->mb.pic.p_fenc;
1365 ALIGNED_4( int16_t mvc[3][2] );
1367 /* XXX Needed for x264_mb_predict_mv */
1368 h->mb.i_partition = D_16x8;
1370 for( int i = 0; i < 2; i++ )
1372 x264_me_t *l0m = &a->l0.me16x8[i];
1373 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1374 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1375 const int ref8[2] = { minref, maxref };
1376 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1378 m.i_pixel = PIXEL_16x8;
1380 LOAD_FENC( &m, p_fenc, 0, 8*i );
1381 l0m->cost = INT_MAX;
1382 for( int j = 0; j < i_ref8s; j++ )
1384 const int i_ref = ref8[j];
1385 m.i_ref_cost = REF_COST( 0, i_ref );
1387 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1388 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1389 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1390 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1392 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1393 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1395 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1396 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1397 /* We can only take this shortcut if the first search was performed on ref0. */
1398 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1400 /* We can just leave the MV from the previous ref search. */
1401 x264_me_refine_qpel_refdupe( h, &m, NULL );
1404 x264_me_search( h, &m, mvc, 3 );
1406 m.cost += m.i_ref_cost;
1408 if( m.cost < l0m->cost )
1409 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1412 /* Early termination based on the current SATD score of partition[0]
1413 plus the estimated SATD score of partition[1] */
1414 if( !i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1416 a->l0.i_cost16x8 = COST_MAX;
1420 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1421 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1424 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P 8x16 analysis: two vertical halves.  Mirrors the 16x8 routine — each
 * half tries only the refs of its two underlying 8x8 partitions (i and i+2),
 * with the same i_best_satd early-termination check on the first half. */
1427 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1430 pixel **p_fenc = h->mb.pic.p_fenc;
1431 ALIGNED_4( int16_t mvc[3][2] );
1433 /* XXX Needed for x264_mb_predict_mv */
1434 h->mb.i_partition = D_8x16;
1436 for( int i = 0; i < 2; i++ )
1438 x264_me_t *l0m = &a->l0.me8x16[i];
1439 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1440 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1441 const int ref8[2] = { minref, maxref };
1442 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1444 m.i_pixel = PIXEL_8x16;
1446 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1447 l0m->cost = INT_MAX;
1448 for( int j = 0; j < i_ref8s; j++ )
1450 const int i_ref = ref8[j];
1451 m.i_ref_cost = REF_COST( 0, i_ref );
/* MV candidates: the 16x16 MV plus the two vertically-stacked 8x8 MVs */
1453 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1454 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1455 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1457 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1458 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1460 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1461 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1462 /* We can only take this shortcut if the first search was performed on ref0. */
1463 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1465 /* We can just leave the MV from the previous ref search. */
1466 x264_me_refine_qpel_refdupe( h, &m, NULL );
1469 x264_me_search( h, &m, mvc, 3 );
1471 m.cost += m.i_ref_cost;
1473 if( m.cost < l0m->cost )
1474 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1477 /* Early termination based on the current SATD score of partition[0]
1478 plus the estimated SATD score of partition[1] */
1479 if( !i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1481 a->l0.i_cost8x16 = COST_MAX;
1485 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1486 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1489 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
/* Chroma cost of a sub-8x8 luma partitioning: motion-compensate the chroma
 * halves of each sub-partition (with optional explicit weighting) into a
 * scratch buffer, then return the combined U+V mbcmp score against the
 * source.  `size` selects which of me4x4/me8x4/me4x8 supplies the MVs.
 * mvy_offset adjusts chroma MVs for interlaced field parity. */
1492 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1494 ALIGNED_ARRAY_8( pixel, pix1,[16*8] );
1495 pixel *pix2 = pix1+8;
1496 const int i_stride = h->mb.pic.i_stride[1];
1497 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1498 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1499 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1500 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1501 x264_weight_t *weight = h->sh.weight[i_ref];
/* MC one chroma sub-block for both U (pix1) and V (pix2), applying the
 * slice weight when present; width/height are in chroma pixels */
1503 #define CHROMA4x4MC( width, height, me, x, y ) \
1504 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1505 if( weight[1].weightfn ) \
1506 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1507 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1508 if( weight[2].weightfn ) \
1509 weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1512 if( size == PIXEL_4x4 )
1514 x264_me_t *m = a->l0.me4x4[i8x8];
1515 CHROMA4x4MC( 2,2, m[0], 0,0 );
1516 CHROMA4x4MC( 2,2, m[1], 2,0 );
1517 CHROMA4x4MC( 2,2, m[2], 0,2 );
1518 CHROMA4x4MC( 2,2, m[3], 2,2 );
1520 else if( size == PIXEL_8x4 )
1522 x264_me_t *m = a->l0.me8x4[i8x8];
1523 CHROMA4x4MC( 4,2, m[0], 0,0 );
1524 CHROMA4x4MC( 4,2, m[1], 0,2 );
1528 x264_me_t *m = a->l0.me4x8[i8x8];
1529 CHROMA4x4MC( 2,4, m[0], 0,0 );
1530 CHROMA4x4MC( 2,4, m[1], 2,0 );
1533 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1534 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* Sub-partition search: split 8x8 block i8x8 into four 4x4 blocks, each
 * searched with the parent 8x8 MV as the sole candidate (first block only).
 * Accumulates the total into a->l0.i_cost4x4[i8x8], including ref cost,
 * sub-partition signalling bits, and (optionally) chroma ME cost. */
1537 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1539 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1540 pixel **p_fenc = h->mb.pic.p_fenc;
1541 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1543 /* XXX Needed for x264_mb_predict_mv */
1544 h->mb.i_partition = D_8x8;
1546 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1548 const int idx = 4*i8x8 + i4x4;
1549 const int x4 = block_idx_x[idx];
1550 const int y4 = block_idx_y[idx];
/* only the first 4x4 uses the parent MV candidate; later blocks rely
 * on the freshly-cached neighbour MVs via the predictor */
1551 const int i_mvc = (i4x4 == 0);
1553 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1555 m->i_pixel = PIXEL_4x4;
1557 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1558 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1559 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1561 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1562 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1564 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1566 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1567 a->l0.me4x4[i8x8][1].cost +
1568 a->l0.me4x4[i8x8][2].cost +
1569 a->l0.me4x4[i8x8][3].cost +
1570 REF_COST( 0, i_ref ) +
1571 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1572 if( h->mb.b_chroma_me )
1573 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* Sub-partition search: split 8x8 block i8x8 into two 8x4 halves.  The first
 * half seeds from the top-left 4x4 MV of the same 8x8 block.  Total cost
 * (incl. ref bits, sub-partition bits, optional chroma ME) is stored in
 * a->l0.i_cost8x4[i8x8]. */
1576 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1578 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1579 pixel **p_fenc = h->mb.pic.p_fenc;
1580 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1582 /* XXX Needed for x264_mb_predict_mv */
1583 h->mb.i_partition = D_8x8;
1585 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1587 const int idx = 4*i8x8 + 2*i8x4;
1588 const int x4 = block_idx_x[idx];
1589 const int y4 = block_idx_y[idx];
1590 const int i_mvc = (i8x4 == 0);
1592 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1594 m->i_pixel = PIXEL_8x4;
1596 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1597 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1598 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1600 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1601 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1603 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1605 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1606 REF_COST( 0, i_ref ) +
1607 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1608 if( h->mb.b_chroma_me )
1609 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* Sub-partition search: split 8x8 block i8x8 into two 4x8 halves.  Mirrors
 * x264_mb_analyse_inter_p8x4 with transposed geometry; result is stored in
 * a->l0.i_cost4x8[i8x8]. */
1612 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1614 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1615 pixel **p_fenc = h->mb.pic.p_fenc;
1616 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1618 /* XXX Needed for x264_mb_predict_mv */
1619 h->mb.i_partition = D_8x8;
1621 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1623 const int idx = 4*i8x8 + i4x8;
1624 const int x4 = block_idx_x[idx];
1625 const int y4 = block_idx_y[idx];
1626 const int i_mvc = (i4x8 == 0);
1628 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1630 m->i_pixel = PIXEL_4x8;
1632 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1633 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1634 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1636 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1637 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1639 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1641 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1642 REF_COST( 0, i_ref ) +
1643 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1644 if( h->mb.b_chroma_me )
1645 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* Score B_DIRECT by comparing the source against the direct-mode prediction
 * already sitting in fdec (no new motion compensation is performed here).
 * When sub-16x16 B analysis is enabled, also fills per-8x8 direct costs
 * (a->i_cost8x8direct[]) used by the B_8x8 sub-partition decision. */
1648 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1650 /* Assumes that fdec still contains the results of
1651 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1653 pixel *p_fenc = h->mb.pic.p_fenc[0];
1654 pixel *p_fdec = h->mb.pic.p_fdec[0];
1656 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1657 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1658 for( int i = 0; i < 4; i++ )
1660 const int x = (i&1)*8;
1661 const int y = (i>>1)*8;
/* 16x16 direct cost is the sum of the four 8x8 comparisons */
1662 a->i_cost16x16direct +=
1663 a->i_cost8x8direct[i] =
1664 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1667 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1670 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
/* B 16x16 analysis: search both reference lists for the best L0 and L1
 * predictions, then combine them into a bidirectional (BI) candidate.
 * The list loop is deliberately contorted to allow fast B_SKIP detection
 * after searching ref0 of each list.  Also tries the all-zero MV pair for
 * BI, which helps in fades.  Results: a->l0.me16x16, a->l1.me16x16,
 * a->{l0,l1}.bi16x16 and a->i_cost16x16bi. */
1673 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1675 ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
1676 ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
1678 int stride0 = 16, stride1 = 16;
1680 ALIGNED_4( int16_t mvc[9][2] );
1681 int try_skip = a->b_try_skip;
1682 int list1_skipped = 0;
1683 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
1684 int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
1685 h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};
1688 m.i_pixel = PIXEL_16x16;
1690 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1692 /* 16x16 Search on list 0 and list 1 */
1693 a->l0.me16x16.cost = INT_MAX;
1694 a->l1.me16x16.cost = INT_MAX;
1695 for( int l = 1; l >= 0; )
1697 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1699 /* This loop is extremely munged in order to facilitate the following order of operations,
1700 * necessary for an efficient fast skip.
1701 * 1. Search list1 ref0.
1702 * 2. Search list0 ref0.
1704 * 4. Search the rest of list0.
1705 * 5. Go back and finish list1.
1707 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1709 if( try_skip && l == 1 && i_ref > 0 )
1715 m.i_ref_cost = REF_COST( l, i_ref );
1717 /* search with ref */
1718 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1719 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1720 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1721 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
1724 m.cost += m.i_ref_cost;
1726 if( m.cost < lX->me16x16.cost )
1727 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1729 /* save mv for predicting neighbors */
1730 CP32( lX->mvc[i_ref][0], m.mv );
1731 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1733 /* Fast skip detection. */
1734 if( i_ref == 0 && try_skip )
/* if ref0's best MV in either list strays from the direct MV, a skip
 * cannot match — stop trying */
1736 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
1737 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
1743 /* We already tested skip */
1744 h->mb.i_type = B_SKIP;
1745 x264_analyse_update_cache( h, a );
1750 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
1752 if( list1_skipped && l == 0 )
1758 /* get cost of BI mode */
1759 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1760 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1761 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1762 src0 = h->mc.get_ref( pix0, &stride0,
1763 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1764 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1765 src1 = h->mc.get_ref( pix1, &stride1,
1766 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1767 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1769 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1771 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1773 + a->l0.bi16x16.cost_mv
1774 + a->l1.bi16x16.cost_mv;
1776 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1777 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1779 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1780 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1781 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1782 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
/* zero MV means the prediction is just the unshifted reference planes */
1783 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1784 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1785 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1786 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1787 + ref_costs + l0_mv_cost + l1_mv_cost;
1788 if( cost00 < a->i_cost16x16bi )
1790 M32( a->l0.bi16x16.mv ) = 0;
1791 M32( a->l1.bi16x16.mv ) = 0;
1792 a->l0.bi16x16.cost_mv = l0_mv_cost;
1793 a->l1.bi16x16.cost_mv = l1_mv_cost;
1794 a->i_cost16x16bi = cost00;
/* add the macroblock-type signalling bits to each candidate */
1799 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1800 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1801 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the chosen MVs of P 8x8 partition i into the MB cache, dispatching
 * on the selected sub-partition shape (8x8 / 8x4 / 4x8 / 4x4). */
1804 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1809 switch( h->mb.i_sub_partition[i] )
1812 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1815 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1816 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1819 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1820 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1823 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1824 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1825 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1826 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
/* unreachable unless the sub-partition table is corrupted */
1829 x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* Load the precomputed direct-mode refs and MVs for 8x8 block `idx`
 * (both lists) from the cache's direct_* arrays into the live MV cache. */
1834 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1838 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1839 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1840 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1841 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
/* Cache ref/MV for a B partition: for each list, store the ME result if the
 * partition type uses that list, otherwise mark ref -1 with a zero MV.
 * Expects `b_mvd` in the caller's scope (the mvd caching lines are gated on
 * it — partially elided in this extract). */
1844 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1845 if( x264_mb_partition_listX_table[0][part] ) \
1847 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
1848 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1852 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1853 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1855 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1857 if( x264_mb_partition_listX_table[1][part] ) \
1859 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
1860 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1864 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1865 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1867 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache MVs for B 8x8 partition i: direct sub-partitions reuse the
 * precomputed direct refs/MVs (zero mvd, marked skip); all other shapes
 * go through CACHE_MV_BI. */
1870 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1874 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1876 x264_mb_load_mv_direct8x8( h, i );
1879 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1880 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1881 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1886 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache MVs for B 16x8 half i (top/bottom) via CACHE_MV_BI. */
1889 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1891 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache MVs for B 8x16 half i (left/right) via CACHE_MV_BI. */
1893 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1895 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B 8x8 analysis with per-partition reference selection: each 8x8 block
 * searches all candidate refs in both lists, then picks the cheapest of
 * L0 / L1 / BI / DIRECT per block.  Accumulates a->i_cost8x8bi and caches
 * the winning MVs so later partitions predict correctly. */
1899 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1901 ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
1902 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
1904 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1905 * than those used by the neighbors */
1906 #define CHECK_NEIGHBOUR(i)\
1908 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
1909 if( ref > i_maxref[l] )\
1913 for( int l = 0; l < 2; l++ )
1915 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1916 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
1917 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1920 CHECK_NEIGHBOUR( -8 - 1 );
1921 CHECK_NEIGHBOUR( -8 + 0 );
1922 CHECK_NEIGHBOUR( -8 + 2 );
1923 CHECK_NEIGHBOUR( -8 + 4 );
1924 CHECK_NEIGHBOUR( 0 - 1 );
1925 CHECK_NEIGHBOUR( 2*8 - 1 );
1929 /* XXX Needed for x264_mb_predict_mv */
1930 h->mb.i_partition = D_8x8;
1934 for( int i = 0; i < 4; i++ )
1940 int stride[2] = {8,8};
1943 m.i_pixel = PIXEL_8x8;
1944 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1946 for( int l = 0; l < 2; l++ )
1948 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1950 lX->me8x8[i].cost = INT_MAX;
1951 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
1953 m.i_ref_cost = REF_COST( l, i_ref );; /* NOTE(review): stray double ';' — harmless empty statement */
1955 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
1957 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
1958 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
1959 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
1960 m.cost += m.i_ref_cost;
1962 if( m.cost < lX->me8x8[i].cost )
1964 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
1965 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
1968 /* save mv for predicting other partitions within this MB */
1969 CP32( lX->mvc[i_ref][i+1], m.mv );
/* build the BI candidate by averaging the two best predictions */
1974 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
1975 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
1976 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
1977 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
1978 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
1979 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
1981 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1982 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
1983 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
1984 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1986 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1987 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* pick the cheapest of L0 / L1 / BI / DIRECT for this 8x8 block */
1989 i_part_cost = a->l0.me8x8[i].cost;
1990 h->mb.i_sub_partition[i] = D_L0_8x8;
1991 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1992 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1993 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1994 a->i_cost8x8bi += i_part_cost;
1996 /* XXX Needed for x264_mb_predict_mv */
1997 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2001 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B-frame 8x8 analysis with a single reference per list: each 8x8 block
 * reuses the refs chosen by the 16x16 searches (l0/l1.me16x16.i_ref)
 * instead of searching all refs.  For each block an L0 and an L1 motion
 * search is run, the bidirectional (BI) prediction is averaged from the
 * two results, and the cheapest of L0/L1/BI/DIRECT is chosen as the
 * sub-partition type.  Accumulates the total into a->i_cost8x8bi.
 * NOTE(review): several brace/declaration lines are elided in this excerpt. */
2004 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2007 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2008 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2009 ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
2011 /* XXX Needed for x264_mb_predict_mv */
2012 h->mb.i_partition = D_8x8;
2016 for( int i = 0; i < 4; i++ )
2021 int i_part_cost_bi = 0;
2022 int stride[2] = {8,8};
/* l==0 -> list0 (a->l0), l==1 -> list1 (a->l1) */
2025 for( int l = 0; l < 2; l++ )
2027 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2028 x264_me_t *m = &lX->me8x8[i];
2029 m->i_pixel = PIXEL_8x8;
2030 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2032 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2033 m->i_ref = lX->me16x16.i_ref;
2035 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2037 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2038 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
/* only the 16x16 MV is used as a search candidate here */
2039 x264_me_search( h, m, &lX->me16x16.mv, 1 );
/* SATD-ish score without MV bits (ref cost not yet added at this point);
 * reused later for the 16x8/8x16 cost estimates */
2040 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2041 m->cost += m->i_ref_cost;
2043 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2045 /* save mv for predicting other partitions within this MB */
2046 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
/* fetch this list's prediction for the BI average below */
2049 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2050 m->mv[0], m->mv[1], 8, 8, weight_none );
2051 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
/* weighted average of the L0 and L1 predictions -> BI candidate */
2053 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2054 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2055 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
/* add sub-partition-type signalling cost to the single-list candidates */
2056 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2057 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* pick the cheapest of L0 / L1 / BI / DIRECT for this 8x8 block */
2059 i_part_cost = a->l0.me8x8[i].cost;
2060 h->mb.i_sub_partition[i] = D_L0_8x8;
2061 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2062 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2063 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2064 a->i_cost8x8bi += i_part_cost;
2066 /* XXX Needed for x264_mb_predict_mv */
2067 x264_mb_cache_mv_b8x8( h, a, i, 0 );
/* macroblock-type signalling cost for B_8x8 */
2071 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B-frame 16x8 analysis.  For each of the two 16x8 halves, search only
 * the refs already chosen by the corresponding pair of 8x8 blocks (one
 * or two candidate refs per list), seed the search with MVs saved in
 * lX->mvc, build the BI prediction, and pick L0/L1/BI per partition.
 * i_best_satd is the best cost so far and drives early termination.
 * NOTE(review): several brace/declaration lines are elided in this excerpt. */
2074 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2076 ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] );
2077 ALIGNED_4( int16_t mvc[3][2] );
2079 h->mb.i_partition = D_16x8;
2080 a->i_cost16x8bi = 0;
2082 for( int i = 0; i < 2; i++ )
2085 int i_part_cost_bi = 0;
2086 int stride[2] = {16,16};
2089 m.i_pixel = PIXEL_16x8;
2090 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2092 for( int l = 0; l < 2; l++ )
2094 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
/* candidate refs = those used by the two 8x8 blocks covering this half */
2095 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2096 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2097 lX->me16x8[i].cost = INT_MAX;
2098 for( int j = 0; j < i_ref8s; j++ )
2100 int i_ref = ref8[j];
/* NOTE(review): stray ';;' below — harmless empty statement */
2101 m.i_ref_cost = REF_COST( l, i_ref );;
2103 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
/* MV candidates: 16x16 MV plus the two 8x8 MVs of this half */
2105 CP32( mvc[0], lX->mvc[i_ref][0] );
2106 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2107 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2109 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2110 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2111 x264_me_search( h, &m, mvc, 3 );
2112 m.cost += m.i_ref_cost;
2114 if( m.cost < lX->me16x8[i].cost )
2115 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
/* build the BI prediction from the best L0 and L1 results */
2120 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2121 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2122 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2123 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2124 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2125 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2127 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2128 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2129 + a->l1.me16x8[i].i_ref_cost;
/* choose the cheapest of L0 / L1 / BI for this half */
2131 i_part_cost = a->l0.me16x8[i].cost;
2132 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2134 if( a->l1.me16x8[i].cost < i_part_cost )
2136 i_part_cost = a->l1.me16x8[i].cost;
2137 a->i_mb_partition16x8[i] = D_L1_8x8;
2139 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2141 i_part_cost = i_part_cost_bi;
2142 a->i_mb_partition16x8[i] = D_BI_8x8;
2144 a->i_cost16x8bi += i_part_cost;
2146 /* Early termination based on the current SATD score of partition[0]
2147 plus the estimated SATD score of partition[1] */
2148 if( !i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2149 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2151 a->i_cost16x8bi = COST_MAX;
2155 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* derive the final MB type (B_L0_L0 .. B_BI_BI) from the two halves */
2159 a->i_mb_type16x8 = B_L0_L0
2160 + (a->i_mb_partition16x8[0]>>2) * 3
2161 + (a->i_mb_partition16x8[1]>>2);
2162 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* B-frame 8x16 analysis — vertical-split mirror of the 16x8 case above.
 * For each of the two 8x16 halves, search only the refs chosen by the
 * corresponding column of 8x8 blocks, seed with saved MVs, build the BI
 * prediction, and pick L0/L1/BI per partition.  i_best_satd drives
 * early termination after the first half.
 * NOTE(review): several brace/declaration lines are elided in this excerpt. */
2165 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2167 ALIGNED_ARRAY_8( pixel, pix,[2],[8*16] );
2168 ALIGNED_4( int16_t mvc[3][2] );
2170 h->mb.i_partition = D_8x16;
2171 a->i_cost8x16bi = 0;
2173 for( int i = 0; i < 2; i++ )
2176 int i_part_cost_bi = 0;
2177 int stride[2] = {8,8};
2180 m.i_pixel = PIXEL_8x16;
2181 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2183 for( int l = 0; l < 2; l++ )
2185 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
/* candidate refs = those used by the vertical pair of 8x8 blocks (i, i+2) */
2186 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2187 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2188 lX->me8x16[i].cost = INT_MAX;
2189 for( int j = 0; j < i_ref8s; j++ )
2191 int i_ref = ref8[j];
2192 m.i_ref_cost = REF_COST( l, i_ref );
2194 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
/* MV candidates: 16x16 MV plus the two 8x8 MVs of this column */
2196 CP32( mvc[0], lX->mvc[i_ref][0] );
2197 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2198 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2200 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2201 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2202 x264_me_search( h, &m, mvc, 3 );
2203 m.cost += m.i_ref_cost;
2205 if( m.cost < lX->me8x16[i].cost )
2206 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
/* build the BI prediction from the best L0 and L1 results */
2211 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2212 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2213 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2214 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2215 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2217 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2218 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2219 + a->l1.me8x16[i].i_ref_cost;
/* choose the cheapest of L0 / L1 / BI for this half */
2221 i_part_cost = a->l0.me8x16[i].cost;
2222 a->i_mb_partition8x16[i] = D_L0_8x8;
2224 if( a->l1.me8x16[i].cost < i_part_cost )
2226 i_part_cost = a->l1.me8x16[i].cost;
2227 a->i_mb_partition8x16[i] = D_L1_8x8;
2229 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2231 i_part_cost = i_part_cost_bi;
2232 a->i_mb_partition8x16[i] = D_BI_8x8;
2234 a->i_cost8x16bi += i_part_cost;
2236 /* Early termination based on the current SATD score of partition[0]
2237 plus the estimated SATD score of partition[1] */
2238 if( !i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2239 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2241 a->i_cost8x16bi = COST_MAX;
2245 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* derive the final MB type; shares the 16x8 cost table */
2249 a->i_mb_type8x16 = B_L0_L0
2250 + (a->i_mb_partition8x16[0]>>2) * 3
2251 + (a->i_mb_partition8x16[1]>>2);
2252 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* Re-score the P-frame partition candidates with full RD cost.
 * Only candidates whose SATD-based cost is within the threshold
 * (5/4 of i_satd; 3/2 for 16x16) are RD-evaluated; the rest are set to
 * COST_MAX so the caller cannot select them.  The i_cost* fields are
 * overwritten in place with RD costs.
 * NOTE(review): several brace/declaration lines are elided in this excerpt. */
2255 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2257 int thresh = i_satd * 5/4;
2259 h->mb.i_type = P_L0;
/* 16x16 gets a looser 3/2 threshold and is skipped if already RD'd */
2260 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2262 h->mb.i_partition = D_16x16;
2263 x264_analyse_update_cache( h, a );
2264 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2267 if( a->l0.i_cost16x8 <= thresh )
2269 h->mb.i_partition = D_16x8;
2270 x264_analyse_update_cache( h, a );
2271 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2274 a->l0.i_cost16x8 = COST_MAX;
2276 if( a->l0.i_cost8x16 <= thresh )
2278 h->mb.i_partition = D_8x16;
2279 x264_analyse_update_cache( h, a );
2280 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2283 a->l0.i_cost8x16 = COST_MAX;
2285 if( a->l0.i_cost8x8 <= thresh )
2287 h->mb.i_type = P_8x8;
2288 h->mb.i_partition = D_8x8;
/* with sub-8x8 partitions enabled, RD-select the best sub-type per block */
2289 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2291 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2292 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2293 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2294 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2295 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2296 * for future blocks are those left over from previous RDO calls. */
2297 for( int i = 0; i < 4; i++ )
2299 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2300 int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2301 int subtype, btype = D_L0_8x8;
2302 uint64_t bcost = COST_MAX64;
2303 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
/* skip sub-types over threshold, but always RD at least D_L0_8x8 */
2306 if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2308 h->mb.i_sub_partition[i] = subtype;
2309 x264_mb_cache_mv_p8x8( h, a, i );
2310 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2311 COPY2_IF_LT( bcost, cost, btype, subtype );
/* restore the winning sub-type in the cache if the last tried lost */
2313 if( h->mb.i_sub_partition[i] != btype )
2315 h->mb.i_sub_partition[i] = btype;
2316 x264_mb_cache_mv_p8x8( h, a, i );
2321 x264_analyse_update_cache( h, a );
2322 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2325 a->l0.i_cost8x8 = COST_MAX;
/* Re-score the B-frame mode candidates with full RD cost.  Each candidate
 * mode whose SATD cost is within thresh (17/16 of i_satd_inter, 18/16
 * with psy-RD) and not yet RD'd gets an RD evaluation; the i_rd* fields
 * are filled in for the caller to compare.
 * NOTE(review): several brace/declaration lines are elided in this excerpt. */
2328 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2330 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2332 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2334 h->mb.i_type = B_DIRECT;
2335 /* Assumes direct/skip MC is still in fdec */
2336 /* Requires b-rdo to be done before intra analysis */
2337 h->mb.b_skip_mc = 1;
2338 x264_analyse_update_cache( h, a );
2339 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2340 h->mb.b_skip_mc = 0;
2343 //FIXME not all the update_cache calls are needed
2344 h->mb.i_partition = D_16x16;
/* L0-only 16x16 */
2346 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2348 h->mb.i_type = B_L0_L0;
2349 x264_analyse_update_cache( h, a );
2350 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* L1-only 16x16 */
2354 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2356 h->mb.i_type = B_L1_L1;
2357 x264_analyse_update_cache( h, a );
2358 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* bidirectional 16x16 */
2362 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2364 h->mb.i_type = B_BI_BI;
2365 x264_analyse_update_cache( h, a );
2366 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* 8x8 partitions */
2370 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2372 h->mb.i_type = B_8x8;
2373 h->mb.i_partition = D_8x8;
2374 x264_analyse_update_cache( h, a );
2375 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2376 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
/* 16x8 partitions (MB type chosen earlier by the 16x8 analysis) */
2380 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2382 h->mb.i_type = a->i_mb_type16x8;
2383 h->mb.i_partition = D_16x8;
2384 x264_analyse_update_cache( h, a );
2385 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* 8x16 partitions */
2389 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2391 h->mb.i_type = a->i_mb_type8x16;
2392 h->mb.i_partition = D_8x16;
2393 x264_analyse_update_cache( h, a );
2394 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* After the final mode decision, run SATD-level bidirectional MV
 * refinement on every partition that chose BI prediction.  No-op for
 * intra macroblocks.  Dispatches on the chosen partition size.
 * NOTE(review): switch-case labels and braces are elided in this excerpt. */
2398 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2402 if( IS_INTRA(h->mb.i_type) )
2405 switch( h->mb.i_partition )
/* 16x16: only B_BI_BI uses the bi16x16 ME results */
2408 if( h->mb.i_type == B_BI_BI )
2410 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2411 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
/* 16x8: refine each half that picked BI */
2415 for( int i = 0; i < 2; i++ )
2416 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2418 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2419 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
/* 8x16: refine each half that picked BI */
2423 for( int i = 0; i < 2; i++ )
2424 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2426 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2427 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
/* 8x8: refine each sub-block that picked BI */
2431 for( int i = 0; i < 4; i++ )
2432 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2434 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2435 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
/* Choose between 4x4 and 8x8 transform for the current inter MB by
 * comparing SA8D (8x8 transform proxy) against SATD (4x4 proxy) on the
 * motion-compensated luma.  Only runs when 8x8 transform is allowed,
 * enabled, and the MB is not lossless.  Sets b_skip_mc because the MC
 * performed here is reused by macroblock_encode. */
2441 static inline void x264_mb_analyse_transform( x264_t *h )
2443 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2445 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2448 int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2449 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2450 int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2451 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2453 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2454 h->mb.b_skip_mc = 1;
/* RD-based transform-size re-check: flip the 4x4/8x8 transform flag,
 * RD-cost the MB, and keep whichever is cheaper.  When the flipped
 * transform wins, *i_rd is updated and *i_satd is rescaled by the RD
 * ratio so later SATD-vs-RD comparisons stay consistent.
 * NOTE(review): some brace/assignment lines are elided in this excerpt. */
2458 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2460 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2462 x264_analyse_update_cache( h, a );
2463 h->mb.b_transform_8x8 ^= 1;
2464 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2465 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2467 if( *i_rd >= i_rd8 )
/* 64-bit intermediate avoids overflow in the proportional rescale */
2470 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
/* flipped transform lost: restore the original flag */
2474 h->mb.b_transform_8x8 ^= 1;
2478 /* Rate-distortion optimal QP selection.
2479 * FIXME: More than half of the benefit of this function seems to be
2480 * in the way it improves the coding of chroma DC (by decimating or
2481 * finding a better way to code a single DC coefficient.)
2482 * There must be a more efficient way to get that portion of the benefit
2483 * without doing full QP-RD, but RD-decimation doesn't seem to do the
/* Searches QPs above and below the current one, RD-costing the MB at
 * each, with a failure-count cutoff in each direction; the best QP found
 * (bqp) is kept.  Also always tries the previous MB's QP, and finally
 * re-checks the transform-size decision at the chosen QP.
 * NOTE(review): several brace/assignment lines are elided in this excerpt. */
2485 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2487 int bcost, cost, failures, prevcost, origcost;
2488 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2489 int last_qp_tried = 0;
2490 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2491 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2493 /* If CBP is already zero, don't raise the quantizer any higher. */
2494 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2496 /* Without psy-RD, require monotonicity when moving quant away from previous
2497 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2498 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2499 * allow 2 failures when moving quant towards previous quant.
2500 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2501 int threshold = (!!h->mb.i_psy_rd);
2502 /* Raise the threshold for failures if we're moving towards the last QP. */
2503 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2504 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2506 h->mb.i_qp = orig_qp;
2508 prevcost = origcost;
2510 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2511 * (up to a point) will too. So, jump down to where the threshold will kick in
2512 * and check the QP there. If the CBP is still empty, skip the main loop.
2513 * If it isn't empty, we would have ended up having to check this QP anyways,
2514 * so as long as we store it for later lookup, we lose nothing. */
2515 int already_checked_qp = -1;
2516 int already_checked_cost = COST_MAX;
2517 if( direction == -1 )
2521 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2522 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2523 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2524 if( !h->mb.cbp[h->mb.i_mb_xy] )
2526 /* If our empty-CBP block is lower QP than the last QP,
2527 * the last QP almost surely doesn't have a CBP either. */
2528 if( h->mb.i_last_qp > h->mb.i_qp )
2532 already_checked_qp = h->mb.i_qp;
2533 h->mb.i_qp = orig_qp;
2537 h->mb.i_qp += direction;
/* main search loop: step QP in this direction until failures exceed threshold */
2538 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2540 if( h->mb.i_last_qp == h->mb.i_qp )
/* reuse the cost computed during the jump-ahead probe above */
2542 if( h->mb.i_qp == already_checked_qp )
2543 cost = already_checked_cost;
2546 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2547 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2548 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2551 /* We can't assume that the costs are monotonic over QPs.
2552 * Tie case-as-failure seems to give better results. */
2553 if( cost < prevcost )
2559 if( failures > threshold )
/* raising QP has emptied the CBP; no point raising it further */
2561 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2563 h->mb.i_qp += direction;
2567 /* Always try the last block's QP. */
2568 if( !last_qp_tried )
2570 h->mb.i_qp = h->mb.i_last_qp;
2571 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2572 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2573 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2577 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2579 /* Check transform again; decision from before may no longer be optimal. */
2580 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2581 x264_mb_transform_8x8_allowed( h ) )
2583 h->mb.b_transform_8x8 ^= 1;
2584 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2586 h->mb.b_transform_8x8 ^= 1;
2590 /*****************************************************************************
2591 * x264_macroblock_analyse:
2592 *****************************************************************************/
2593 void x264_macroblock_analyse( x264_t *h )
2595 x264_mb_analysis_t analysis;
2596 int i_cost = COST_MAX;
2598 h->mb.i_qp = x264_ratecontrol_mb_qp( h );
2599 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2600 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2601 if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2602 h->mb.i_qp = h->mb.i_last_qp;
2604 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2606 /*--------------------------- Do the analysis ---------------------------*/
2607 if( h->sh.i_type == SLICE_TYPE_I )
2610 if( analysis.i_mbrd )
2611 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2612 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2613 if( analysis.i_mbrd )
2614 x264_intra_rd( h, &analysis, COST_MAX );
2616 i_cost = analysis.i_satd_i16x16;
2617 h->mb.i_type = I_16x16;
2618 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2619 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2620 if( analysis.i_satd_pcm < i_cost )
2621 h->mb.i_type = I_PCM;
2623 else if( analysis.i_mbrd >= 2 )
2624 x264_intra_rd_refine( h, &analysis );
2626 else if( h->sh.i_type == SLICE_TYPE_P )
2630 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2632 analysis.b_try_skip = 0;
2633 if( analysis.b_force_intra )
2635 if( !h->param.analyse.b_psy )
2637 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2638 goto intra_analysis;
2643 /* Fast P_SKIP detection */
2644 if( h->param.analyse.b_fast_pskip )
2646 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2647 // FIXME don't need to check this if the reference frame is done
2649 else if( h->param.analyse.i_subpel_refine >= 3 )
2650 analysis.b_try_skip = 1;
2651 else if( h->mb.i_mb_type_left == P_SKIP ||
2652 h->mb.i_mb_type_top == P_SKIP ||
2653 h->mb.i_mb_type_topleft == P_SKIP ||
2654 h->mb.i_mb_type_topright == P_SKIP )
2655 b_skip = x264_macroblock_probe_pskip( h );
2659 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2663 h->mb.i_type = P_SKIP;
2664 h->mb.i_partition = D_16x16;
2665 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2666 /* Set up MVs for future predictors */
2667 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2668 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2672 const unsigned int flags = h->param.analyse.inter;
2676 int i_satd_inter, i_satd_intra;
2678 x264_mb_analyse_load_costs( h, &analysis );
2680 x264_mb_analyse_inter_p16x16( h, &analysis );
2682 if( h->mb.i_type == P_SKIP )
2684 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2685 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2689 if( flags & X264_ANALYSE_PSUB16x16 )
2691 if( h->param.analyse.b_mixed_references )
2692 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2694 x264_mb_analyse_inter_p8x8( h, &analysis );
2697 /* Select best inter mode */
2699 i_partition = D_16x16;
2700 i_cost = analysis.l0.me16x16.cost;
2702 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2703 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2706 i_partition = D_8x8;
2707 i_cost = analysis.l0.i_cost8x8;
2710 if( flags & X264_ANALYSE_PSUB8x8 )
2712 for( int i = 0; i < 4; i++ )
2714 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2715 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2717 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2718 h->mb.i_sub_partition[i] = D_L0_4x4;
2720 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2721 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2722 h->mb.i_sub_partition[i], D_L0_8x4 );
2724 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2725 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2726 h->mb.i_sub_partition[i], D_L0_4x8 );
2728 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2730 x264_mb_cache_mv_p8x8( h, &analysis, i );
2732 analysis.l0.i_cost8x8 = i_cost;
2736 /* Now do 16x8/8x16 */
2737 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2738 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2739 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2741 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
2742 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2743 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2745 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
2746 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2748 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
2749 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2750 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2752 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
2753 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2756 h->mb.i_partition = i_partition;
2759 //FIXME mb_type costs?
2760 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2764 else if( i_partition == D_16x16 )
2766 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2767 i_cost = analysis.l0.me16x16.cost;
2769 else if( i_partition == D_16x8 )
2771 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2772 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2773 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2775 else if( i_partition == D_8x16 )
2777 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2778 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2779 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2781 else if( i_partition == D_8x8 )
2784 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2786 switch( h->mb.i_sub_partition[i8x8] )
2789 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2790 i_cost += analysis.l0.me8x8[i8x8].cost;
2793 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2794 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2795 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2796 analysis.l0.me8x4[i8x8][1].cost;
2799 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2800 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2801 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2802 analysis.l0.me4x8[i8x8][1].cost;
2806 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2807 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2808 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2809 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2810 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2811 analysis.l0.me4x4[i8x8][1].cost +
2812 analysis.l0.me4x4[i8x8][2].cost +
2813 analysis.l0.me4x4[i8x8][3].cost;
2816 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2822 if( h->mb.b_chroma_me )
2824 x264_mb_analyse_intra_chroma( h, &analysis );
2825 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2826 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2827 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2828 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2831 x264_mb_analyse_intra( h, &analysis, i_cost );
2833 i_satd_inter = i_cost;
2834 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2835 analysis.i_satd_i8x8,
2836 analysis.i_satd_i4x4 );
2838 if( analysis.i_mbrd )
2840 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2842 i_partition = D_16x16;
2843 i_cost = analysis.l0.i_rd16x16;
2844 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2845 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2846 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2847 h->mb.i_type = i_type;
2848 h->mb.i_partition = i_partition;
2849 if( i_cost < COST_MAX )
2850 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2851 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2854 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2855 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2856 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2857 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2859 h->mb.i_type = i_type;
2861 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2863 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2864 * it was an inter block. */
2865 x264_analyse_update_cache( h, &analysis );
2866 x264_macroblock_encode( h );
2867 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2868 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2869 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2870 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2871 goto intra_analysis;
2874 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2876 if( IS_INTRA( h->mb.i_type ) )
2878 x264_intra_rd_refine( h, &analysis );
2880 else if( i_partition == D_16x16 )
2882 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2883 analysis.l0.me16x16.cost = i_cost;
2884 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2886 else if( i_partition == D_16x8 )
2888 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2889 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2890 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2891 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2892 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2893 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2895 else if( i_partition == D_8x16 )
2897 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2898 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2899 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2900 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2901 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2902 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2904 else if( i_partition == D_8x8 )
2906 x264_analyse_update_cache( h, &analysis );
2907 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2909 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2911 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2913 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2915 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2916 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2918 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2920 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2921 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2923 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2925 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2926 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2927 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2928 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2935 else if( h->sh.i_type == SLICE_TYPE_B )
2937 int i_bskip_cost = COST_MAX;
2940 if( analysis.i_mbrd )
2941 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2943 h->mb.i_type = B_SKIP;
2944 if( h->mb.b_direct_auto_write )
2946 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2947 for( int i = 0; i < 2; i++ )
2950 h->sh.b_direct_spatial_mv_pred ^= 1;
2951 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2952 if( analysis.b_direct_available )
2957 b_skip = x264_macroblock_probe_bskip( h );
2959 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2966 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2968 analysis.b_try_skip = 0;
2969 if( analysis.b_direct_available )
2971 if( !h->mb.b_direct_auto_write )
2973 if( analysis.i_mbrd )
2975 i_bskip_cost = ssd_mb( h );
2976 /* 6 = minimum cavlc cost of a non-skipped MB */
2977 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2979 else if( !h->mb.b_direct_auto_write )
2981 /* Conditioning the probe on neighboring block types
2982 * doesn't seem to help speed or quality. */
2983 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
2984 if( h->param.analyse.i_subpel_refine < 3 )
2985 b_skip = analysis.b_try_skip;
2987 /* Set up MVs for future predictors */
2990 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2991 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2992 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
2993 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
2999 const unsigned int flags = h->param.analyse.inter;
3003 h->mb.b_skip_mc = 0;
3004 h->mb.i_type = B_DIRECT;
3006 x264_mb_analyse_load_costs( h, &analysis );
3008 /* select best inter mode */
3009 /* direct must be first */
3010 if( analysis.b_direct_available )
3011 x264_mb_analyse_inter_direct( h, &analysis );
3013 x264_mb_analyse_inter_b16x16( h, &analysis );
3015 if( h->mb.i_type == B_SKIP )
3017 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3018 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3019 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
3020 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3025 i_partition = D_16x16;
3026 i_cost = analysis.l0.me16x16.cost;
3027 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
3028 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
3029 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
3031 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
3033 x264_mb_analyse_b_rd( h, &analysis, i_cost );
3034 if( i_bskip_cost < analysis.i_rd16x16direct &&
3035 i_bskip_cost < analysis.i_rd16x16bi &&
3036 i_bskip_cost < analysis.l0.i_rd16x16 &&
3037 i_bskip_cost < analysis.l1.i_rd16x16 )
3039 h->mb.i_type = B_SKIP;
3040 x264_analyse_update_cache( h, &analysis );
3045 if( flags & X264_ANALYSE_BSUB16x16 )
3047 if( h->param.analyse.b_mixed_references )
3048 x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
3050 x264_mb_analyse_inter_b8x8( h, &analysis );
3052 COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3054 /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
3055 int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
3056 int i_mb_type, i_partition16x8[2], i_partition8x16[2];
3057 for( int i = 0; i < 2; i++ )
3059 int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
3060 int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
3062 i_best_cost = COST_MAX;
3063 i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
3064 i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
3065 i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
3066 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
3067 + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3068 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
3069 + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3070 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
3071 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
3072 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
3073 analysis.i_cost_est16x8[i] = i_best_cost;
3076 i_best_cost = COST_MAX;
3077 i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
3078 i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
3079 i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
3080 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
3081 + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3082 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
3083 + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3084 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
3085 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
3086 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
3087 analysis.i_cost_est8x16[i] = i_best_cost;
3089 i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
3090 analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3091 i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
3092 i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
3093 analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3094 i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
3096 /* We can gain a little speed by checking the mode with the lowest estimated cost first */
3097 int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
3098 if( try_16x8_first && i_cost_est16x8bi_total < i_cost )
3100 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3101 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3103 if( i_cost_est8x16bi_total < i_cost )
3105 x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
3106 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3108 if( !try_16x8_first && i_cost_est16x8bi_total < i_cost )
3110 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3111 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3115 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3120 else if( i_partition == D_16x16 )
3122 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3123 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3124 if( i_type == B_L0_L0 )
3126 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3127 i_cost = analysis.l0.me16x16.cost
3128 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3130 else if( i_type == B_L1_L1 )
3132 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3133 i_cost = analysis.l1.me16x16.cost
3134 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3136 else if( i_type == B_BI_BI )
3138 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3139 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3142 else if( i_partition == D_16x8 )
3144 for( int i = 0; i < 2; i++ )
3146 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3147 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3148 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3149 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3152 else if( i_partition == D_8x16 )
3154 for( int i = 0; i < 2; i++ )
3156 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3157 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3158 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3159 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3162 else if( i_partition == D_8x8 )
3164 for( int i = 0; i < 4; i++ )
3167 int i_part_cost_old;
3169 int i_part_type = h->mb.i_sub_partition[i];
3170 int b_bidir = (i_part_type == D_BI_8x8);
3172 if( i_part_type == D_DIRECT_8x8 )
3174 if( x264_mb_partition_listX_table[0][i_part_type] )
3176 m = &analysis.l0.me8x8[i];
3177 i_part_cost_old = m->cost;
3178 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3179 m->cost -= i_type_cost;
3180 x264_me_refine_qpel( h, m );
3182 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3184 if( x264_mb_partition_listX_table[1][i_part_type] )
3186 m = &analysis.l1.me8x8[i];
3187 i_part_cost_old = m->cost;
3188 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3189 m->cost -= i_type_cost;
3190 x264_me_refine_qpel( h, m );
3192 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3194 /* TODO: update mvp? */
3198 i_satd_inter = i_cost;
3200 if( analysis.i_mbrd )
3202 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3204 i_cost = i_bskip_cost;
3205 i_partition = D_16x16;
3206 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3207 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3208 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3209 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3210 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3211 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3212 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3214 h->mb.i_type = i_type;
3215 h->mb.i_partition = i_partition;
3218 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3220 if( analysis.i_mbrd )
3222 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3223 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
3226 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3227 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3228 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3229 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3231 h->mb.i_type = i_type;
3232 h->mb.i_partition = i_partition;
3234 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3235 x264_intra_rd_refine( h, &analysis );
3236 if( h->mb.i_subpel_refine >= 5 )
3237 x264_refine_bidir( h, &analysis );
3239 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3242 x264_analyse_update_cache( h, &analysis );
3244 if( i_partition == D_16x16 )
3246 if( i_type == B_L0_L0 )
3248 analysis.l0.me16x16.cost = i_cost;
3249 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3251 else if( i_type == B_L1_L1 )
3253 analysis.l1.me16x16.cost = i_cost;
3254 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3256 else if( i_type == B_BI_BI )
3258 i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3259 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3262 else if( i_partition == D_16x8 )
3264 for( int i = 0; i < 2; i++ )
3266 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3267 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3268 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3269 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3270 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3271 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3273 i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3274 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3278 else if( i_partition == D_8x16 )
3280 for( int i = 0; i < 2; i++ )
3282 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3283 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3284 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3285 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3286 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3287 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3289 i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3290 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3294 else if( i_partition == D_8x8 )
3296 for( int i = 0; i < 4; i++ )
3298 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3299 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3300 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3301 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3302 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3304 i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3305 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3313 x264_analyse_update_cache( h, &analysis );
3315 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3316 * without realizing it. Check for this and account for it if necessary. */
3317 if( analysis.i_mbrd >= 2 )
3319 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3320 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3321 int list = check_mv_lists[h->mb.i_type] - 1;
3322 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3323 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3324 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3325 h->mb.i_partition = D_16x16;
3328 if( !analysis.i_mbrd )
3329 x264_mb_analyse_transform( h );
3331 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3332 x264_mb_analyse_qp_rd( h, &analysis );
3334 h->mb.b_trellis = h->param.analyse.i_trellis;
3335 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3336 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3337 x264_psy_trellis_init( h, 0 );
3338 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3339 h->mb.i_skip_intra = 0;
3342 /*-------------------- Update MB from the analysis ----------------------*/
3343 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3345 switch( h->mb.i_type )
3348 for( int i = 0; i < 16; i++ )
3349 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3351 x264_mb_analyse_intra_chroma( h, a );
3354 for( int i = 0; i < 4; i++ )
3355 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3357 x264_mb_analyse_intra_chroma( h, a );
3360 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3361 x264_mb_analyse_intra_chroma( h, a );
3368 switch( h->mb.i_partition )
3371 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3372 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3376 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3377 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3378 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3379 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3383 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3384 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3385 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3386 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3390 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3396 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3397 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3398 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3399 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3400 for( int i = 0; i < 4; i++ )
3401 x264_mb_cache_mv_p8x8( h, a, i );
3406 h->mb.i_partition = D_16x16;
3407 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3408 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3414 h->mb.i_partition = h->mb.cache.direct_partition;
3415 x264_mb_load_mv_direct8x8( h, 0 );
3416 x264_mb_load_mv_direct8x8( h, 1 );
3417 x264_mb_load_mv_direct8x8( h, 2 );
3418 x264_mb_load_mv_direct8x8( h, 3 );
3422 /* optimize: cache might not need to be rewritten */
3423 for( int i = 0; i < 4; i++ )
3424 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3427 default: /* the rest of the B types */
3428 switch( h->mb.i_partition )
3431 switch( h->mb.i_type )
3434 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3435 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3437 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3438 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3439 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3442 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3443 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3444 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3446 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3447 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3450 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3451 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3453 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3454 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3459 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3460 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3463 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3464 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3467 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3473 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3475 for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3478 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3481 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3482 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3484 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3485 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
3486 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
3487 h->mb.cache.mv[l][x264_scan8[15]][0],
3488 h->mb.cache.mv[l][x264_scan8[15]][1] );
3489 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
3490 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3491 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
3492 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3493 x264_mb_analyse_intra( h, a, COST_MAX );
3494 h->mb.i_type = I_16x16;
3495 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3496 x264_mb_analyse_intra_chroma( h, a );
3503 #include "slicetype.c"