1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
42 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
74 /* conduct the analysis using this lamda and QP */
79 uint16_t *p_cost_ref[2];
84 /* Take some shortcuts in intra search if intra is deemed unlikely */
86 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
91 int i_satd_i16x16_dir[7];
96 int i_satd_i8x8_dir[12][4];
100 int i_predict4x4[16];
105 int i_satd_i8x8chroma;
106 int i_satd_i8x8chroma_dir[7];
107 int i_predict8x8chroma;
109 /* II: Inter part P/B frame */
110 x264_mb_analysis_list_t l0;
111 x264_mb_analysis_list_t l1;
113 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
114 int i_cost16x16direct;
116 int i_cost8x8direct[4];
117 int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
118 int i_cost_est16x8[2]; /* Per-partition estimated cost */
119 int i_cost_est8x16[2];
128 int i_mb_partition16x8[2]; /* mb_partition_e */
129 int i_mb_partition8x16[2];
130 int i_mb_type16x8; /* mb_class_e */
133 int b_direct_available;
135 } x264_mb_analysis_t;
137 /* lambda = pow(2,qp/6-2) */
138 const uint8_t x264_lambda_tab[52] = {
139 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
140 1, 1, 1, 1, /* 8-11 */
141 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
142 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
143 6, 7, 8, 9,10,11,13,14, /* 28-35 */
144 16,18,20,23,25,29,32,36, /* 36-43 */
145 40,45,51,57,64,72,81,91 /* 44-51 */
148 /* lambda2 = pow(lambda,2) * .9 * 256 */
149 const int x264_lambda2_tab[52] = {
150 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
151 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
152 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
153 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
154 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
155 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
156 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
159 const uint8_t x264_exp2_lut[64] = {
160 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
161 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
162 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
163 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
166 const float x264_log2_lut[128] = {
167 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
168 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
169 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
170 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
171 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
172 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
173 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
174 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
175 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
176 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
177 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
178 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
179 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
180 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
181 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
182 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
185 /* Avoid an int/float conversion. */
186 const float x264_log2_lz_lut[32] = {
187 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
190 // should the intra and inter lambdas be different?
191 // I'm just matching the behaviour of deadzone quant.
192 static const int x264_trellis_lambda2_tab[2][52] = {
193 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
194 { 46, 58, 73, 92, 117, 147,
195 185, 233, 294, 370, 466, 587,
196 740, 932, 1174, 1480, 1864, 2349,
197 2959, 3728, 4697, 5918, 7457, 9395,
198 11837, 14914, 18790, 23674, 29828, 37581,
199 47349, 59656, 75163, 94699, 119313, 150326,
200 189399, 238627, 300652, 378798, 477255, 601304,
201 757596, 954511, 1202608, 1515192, 1909022, 2405217,
202 3030384, 3818045, 4810435, 6060769 },
203 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
204 { 27, 34, 43, 54, 68, 86,
205 108, 136, 172, 216, 273, 343,
206 433, 545, 687, 865, 1090, 1374,
207 1731, 2180, 2747, 3461, 4361, 5494,
208 6922, 8721, 10988, 13844, 17442, 21976,
209 27688, 34885, 43953, 55377, 69771, 87906,
210 110755, 139543, 175813, 221511, 279087, 351627,
211 443023, 558174, 703255, 886046, 1116348, 1406511,
212 1772093, 2232697, 2813022, 3544186 }
215 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
216 16, 20, 25, 32, 40, 50,
217 64, 80, 101, 128, 161, 203,
218 256, 322, 406, 512, 645, 812,
219 1024, 1290, 1625, 2048, 2580, 3250,
220 4096, 5160, 6501, 8192, 10321, 13003,
221 16384, 20642, 26007, 32768, 41285, 52015,
225 /* TODO: calculate CABAC costs */
226 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
227 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
229 static const uint8_t i_mb_b16x8_cost_table[17] = {
230 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
232 static const uint8_t i_sub_mb_b_cost_table[13] = {
233 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
235 static const uint8_t i_sub_mb_p_cost_table[4] = {
239 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
241 static uint16_t x264_cost_ref[92][3][33];
242 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
244 int x264_analyse_init_costs( x264_t *h, int qp )
246 int lambda = x264_lambda_tab[qp];
247 if( h->cost_mv[lambda] )
249 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
250 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
251 h->cost_mv[lambda] += 2*4*2048;
252 for( int i = 0; i <= 2*4*2048; i++ )
254 h->cost_mv[lambda][-i] =
255 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
257 x264_pthread_mutex_lock( &cost_ref_mutex );
258 for( int i = 0; i < 3; i++ )
259 for( int j = 0; j < 33; j++ )
260 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
261 x264_pthread_mutex_unlock( &cost_ref_mutex );
262 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
264 for( int j = 0; j < 4; j++ )
266 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
267 h->cost_mv_fpel[lambda][j] += 2*2048;
268 for( int i = -2*2048; i < 2*2048; i++ )
269 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
277 void x264_analyse_free_costs( x264_t *h )
279 for( int i = 0; i < 92; i++ )
282 x264_free( h->cost_mv[i] - 2*4*2048 );
283 if( h->cost_mv_fpel[i][0] )
284 for( int j = 0; j < 4; j++ )
285 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
289 void x264_analyse_weight_frame( x264_t *h, int end )
291 for( int j = 0; j < h->i_ref0; j++ )
293 if( h->sh.weight[j][0].weightfn )
295 x264_frame_t *frame = h->fref0[j];
296 int width = frame->i_width[0] + 2*PADH;
297 int i_padv = PADV << h->param.b_interlaced;
299 uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
300 height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
301 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
302 h->fenc->i_lines_weighted += height;
304 for( int k = j; k < h->i_ref0; k++ )
305 if( h->sh.weight[k][0].weightfn )
307 uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
308 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
309 src + offset, frame->i_stride[0],
310 width, height, &h->sh.weight[k][0] );
317 /* initialize an array of lambda*nbits for all possible mvs */
318 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
320 a->p_cost_mv = h->cost_mv[a->i_lambda];
321 a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
322 a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
325 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
327 /* conduct the analysis using this lamda and QP */
328 a->i_qp = h->mb.i_qp = i_qp;
329 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
331 a->i_lambda = x264_lambda_tab[i_qp];
332 a->i_lambda2 = x264_lambda2_tab[i_qp];
334 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
335 if( h->param.analyse.i_trellis )
337 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
338 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
339 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
340 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
342 h->mb.i_psy_rd_lambda = a->i_lambda;
343 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
344 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
347 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
349 int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
351 /* mbrd == 1 -> RD mode decision */
352 /* mbrd == 2 -> RD refinement */
353 /* mbrd == 3 -> QPRD */
354 a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
356 x264_mb_analyse_init_qp( h, a, i_qp );
358 h->mb.b_transform_8x8 = 0;
359 h->mb.b_noise_reduction = 0;
365 a->i_satd_i8x8chroma = COST_MAX;
367 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
368 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
372 h->mb.b_lossless ? 0 :
374 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
376 /* II: Inter part P/B frame */
377 if( h->sh.i_type != SLICE_TYPE_I )
379 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
380 // limit motion search to a slightly smaller range than the theoretical limit,
381 // since the search may go a few iterations past its given range
382 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
384 /* Calculate max allowed MV range */
385 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
386 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
387 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
388 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
389 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
390 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
392 int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
393 int max_mv = max_x - 4*16*h->mb.i_mb_x;
394 /* If we're left of the refresh bar, don't reference right of it. */
395 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
396 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
398 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
399 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
400 if( h->mb.i_mb_x == 0 )
402 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
403 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
404 int thread_mvy_range = i_fmv_range;
406 if( h->i_thread_frames > 1 )
408 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
409 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
410 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
412 x264_frame_t **fref = i ? h->fref1 : h->fref0;
413 int i_ref = i ? h->i_ref1 : h->i_ref0;
414 for( int j = 0; j < i_ref; j++ )
416 x264_frame_cond_wait( fref[j]->orig, thresh );
417 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
421 if( h->param.b_deterministic )
422 thread_mvy_range = h->param.analyse.i_mv_range_thread;
423 if( h->mb.b_interlaced )
424 thread_mvy_range >>= 1;
426 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
429 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
430 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
431 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
432 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
433 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
434 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
435 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
443 a->l0.i_cost8x16 = COST_MAX;
444 if( h->sh.i_type == SLICE_TYPE_B )
449 a->i_cost8x8direct[0] =
450 a->i_cost8x8direct[1] =
451 a->i_cost8x8direct[2] =
452 a->i_cost8x8direct[3] =
461 a->i_cost16x16direct =
464 a->i_cost8x16bi = COST_MAX;
466 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
467 for( int i = 0; i < 4; i++ )
471 a->l0.i_cost4x8[i] = COST_MAX;
474 /* Fast intra decision */
475 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
477 /* Always run in fast-intra mode for subme < 3 */
478 if( h->mb.i_subpel_refine > 2 &&
479 ( IS_INTRA( h->mb.i_mb_type_left ) ||
480 IS_INTRA( h->mb.i_mb_type_top ) ||
481 IS_INTRA( h->mb.i_mb_type_topleft ) ||
482 IS_INTRA( h->mb.i_mb_type_topright ) ||
483 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] )) ||
484 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
485 { /* intra is likely */ }
492 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
493 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
495 a->b_force_intra = 1;
499 a->b_force_intra = 0;
503 /* Prediction modes allowed for various combinations of neighbors. */
504 /* Terminated by a -1. */
505 /* In order, no neighbors, left, top, top/left, top/left/topleft */
506 static const int8_t i16x16_mode_available[5][5] =
508 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
509 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
510 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
511 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
512 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
515 static const int8_t i8x8chroma_mode_available[5][5] =
517 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
518 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
519 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
520 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
521 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
524 static const int8_t i4x4_mode_available[5][10] =
526 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
527 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
528 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
529 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
530 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
533 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
535 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
536 return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
539 static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
541 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
542 return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
545 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
547 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
548 return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
551 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
552 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
554 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
556 if( do_both_dct || h->mb.b_transform_8x8 )
557 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
558 if( do_both_dct || !h->mb.b_transform_8x8 )
559 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
562 /* Reset fenc satd scores cache for psy RD */
563 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
565 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
566 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
567 if( !h->mb.i_psy_rd )
569 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
570 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
572 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
575 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
577 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
579 if( a->i_satd_i8x8chroma < COST_MAX )
582 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
584 /* 8x8 prediction selection for chroma */
585 if( predict_mode[3] >= 0 && b_merged_satd )
587 int satdu[4], satdv[4];
588 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
589 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
590 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
591 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
592 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
593 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
595 for( ; *predict_mode >= 0; predict_mode++ )
597 int i_mode = *predict_mode;
598 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
600 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
601 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
606 for( ; *predict_mode >= 0; predict_mode++ )
609 int i_mode = *predict_mode;
611 /* we do the prediction */
612 if( h->mb.b_lossless )
613 x264_predict_lossless_8x8_chroma( h, i_mode );
616 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
617 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
620 /* we calculate the cost */
621 i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
622 h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
623 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
625 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
626 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
630 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
633 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
635 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
636 uint8_t *p_src = h->mb.pic.p_fenc[0];
637 uint8_t *p_dst = h->mb.pic.p_fdec[0];
640 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
642 /*---------------- Try all mode and calculate their score ---------------*/
644 /* 16x16 prediction selection */
645 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
647 if( b_merged_satd && predict_mode[3] >= 0 )
649 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
650 h->predict_16x16[I_PRED_16x16_P]( p_dst );
651 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
652 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
653 for( int i = 0; i < 4; i++ )
655 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
656 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
661 for( ; *predict_mode >= 0; predict_mode++ )
664 int i_mode = *predict_mode;
666 if( h->mb.b_lossless )
667 x264_predict_lossless_16x16( h, i_mode );
669 h->predict_16x16[i_mode]( p_dst );
671 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
672 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
673 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
674 a->i_satd_i16x16_dir[i_mode] = i_satd;
678 if( h->sh.i_type == SLICE_TYPE_B )
679 /* cavlc mb type prefix */
680 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
682 /* Not heavily tuned */
683 const uint8_t i16x16_thresh[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
684 if( a->b_fast_intra && a->i_satd_i16x16 > (i16x16_thresh[h->mb.i_subpel_refine]*i_satd_inter)>>1 )
687 /* 8x8 prediction selection */
688 if( flags & X264_ANALYSE_I8x8 )
690 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
691 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
692 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
694 // FIXME some bias like in i4x4?
695 int i_cost = a->i_lambda * 4; /* base predmode costs */
696 h->mb.i_cbp_luma = 0;
697 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
699 if( h->sh.i_type == SLICE_TYPE_B )
700 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
702 for( idx = 0;; idx++ )
706 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
707 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
708 int i_best = COST_MAX;
709 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
711 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
712 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
714 if( b_merged_satd && predict_mode[8] >= 0 )
717 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
718 satd[i_pred_mode] -= 3 * a->i_lambda;
719 for( int i = 2; i >= 0; i-- )
721 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i];
722 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
727 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
730 int i_mode = *predict_mode;
732 if( h->mb.b_lossless )
733 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
735 h->predict_8x8[i_mode]( p_dst_by, edge );
737 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
738 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
739 i_satd -= 3 * a->i_lambda;
741 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
742 a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * a->i_lambda;
744 i_cost += i_best + 3 * a->i_lambda;
746 if( idx == 3 || i_cost > i_satd_thresh )
749 /* we need to encode this block now (for next ones) */
750 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
751 x264_mb_encode_i8x8( h, idx, a->i_qp );
753 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
758 a->i_satd_i8x8 = i_cost;
759 if( h->mb.i_skip_intra )
761 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
762 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
763 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
764 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
765 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
766 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
767 if( h->mb.i_skip_intra == 2 )
768 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
773 static const uint16_t cost_div_fix8[3] = {1024,512,341};
774 a->i_satd_i8x8 = COST_MAX;
775 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
777 /* Not heavily tuned */
778 const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
779 if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
783 /* 4x4 prediction selection */
784 if( flags & X264_ANALYSE_I4x4 )
786 int i_cost = a->i_lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */
787 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
788 h->mb.i_cbp_luma = 0;
789 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
791 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
793 if( h->sh.i_type == SLICE_TYPE_B )
794 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
796 for( idx = 0;; idx++ )
798 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
799 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
800 int i_best = COST_MAX;
801 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
803 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
805 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
806 /* emulate missing topright samples */
807 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
809 if( b_merged_satd && predict_mode[5] >= 0 )
812 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
813 satd[i_pred_mode] -= 3 * a->i_lambda;
814 for( int i = 2; i >= 0; i-- )
815 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
821 for( ; *predict_mode >= 0; predict_mode++ )
824 int i_mode = *predict_mode;
826 if( h->mb.b_lossless )
827 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
829 h->predict_4x4[i_mode]( p_dst_by );
831 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
832 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
834 i_satd -= a->i_lambda * 3;
838 a->i_predict4x4[idx] = i_mode;
843 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
846 i_cost += i_best + 3 * a->i_lambda;
848 if( i_cost > i_satd_thresh || idx == 15 )
851 /* we need to encode this block now (for next ones) */
852 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
853 x264_mb_encode_i4x4( h, idx, a->i_qp );
855 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
859 a->i_satd_i4x4 = i_cost;
860 if( h->mb.i_skip_intra )
862 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
863 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
864 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
865 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
866 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
867 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
868 if( h->mb.i_skip_intra == 2 )
869 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
873 a->i_satd_i4x4 = COST_MAX;
877 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
879 if( a->i_satd_i16x16 <= i_satd_thresh )
881 h->mb.i_type = I_16x16;
882 x264_analyse_update_cache( h, a );
883 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
886 a->i_satd_i16x16 = COST_MAX;
888 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
890 h->mb.i_type = I_4x4;
891 x264_analyse_update_cache( h, a );
892 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
895 a->i_satd_i4x4 = COST_MAX;
897 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
899 h->mb.i_type = I_8x8;
900 x264_analyse_update_cache( h, a );
901 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
902 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
905 a->i_satd_i8x8 = COST_MAX;
908 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
910 uint8_t *p_dst = h->mb.pic.p_fdec[0];
913 uint64_t i_satd, i_best;
914 h->mb.i_skip_intra = 0;
916 if( h->mb.i_type == I_16x16 )
918 int old_pred_mode = a->i_predict16x16;
919 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
920 int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
921 i_best = a->i_satd_i16x16;
922 for( ; *predict_mode >= 0; predict_mode++ )
924 int i_mode = *predict_mode;
925 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
927 h->mb.i_intra16x16_pred_mode = i_mode;
928 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
929 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
933 /* RD selection for chroma prediction */
934 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
935 if( predict_mode[1] >= 0 )
937 int8_t predict_mode_sorted[4];
939 int i_thresh = a->i_satd_i8x8chroma * 5/4;
941 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
943 int i_mode = *predict_mode;
944 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
945 predict_mode_sorted[i_max++] = i_mode;
950 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
951 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
952 /* the previous thing encoded was x264_intra_rd(), so the pixels and
953 * coefs for the current chroma mode are still around, so we only
954 * have to recount the bits. */
955 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
956 for( int i = 0; i < i_max; i++ )
958 int i_mode = predict_mode_sorted[i];
959 if( h->mb.b_lossless )
960 x264_predict_lossless_8x8_chroma( h, i_mode );
963 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
964 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
966 /* if we've already found a mode that needs no residual, then
967 * probably any mode with a residual will be worse.
968 * so avoid dct on the remaining modes to improve speed. */
969 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
970 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
972 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
973 h->mb.i_cbp_chroma = i_cbp_chroma_best;
977 if( h->mb.i_type == I_4x4 )
979 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
981 for( int idx = 0; idx < 16; idx++ )
983 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
986 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
988 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
989 /* emulate missing topright samples */
990 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
992 for( ; *predict_mode >= 0; predict_mode++ )
994 int i_mode = *predict_mode;
995 if( h->mb.b_lossless )
996 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
998 h->predict_4x4[i_mode]( p_dst_by );
999 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1001 if( i_best > i_satd )
1003 a->i_predict4x4[idx] = i_mode;
1005 pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1006 pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1007 pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1008 pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1009 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1013 M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1014 M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1015 M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1016 M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1017 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1019 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1022 else if( h->mb.i_type == I_8x8 )
1024 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1025 for( int idx = 0; idx < 4; idx++ )
1027 uint64_t pels_h = 0;
1029 uint16_t i_nnz[2] = {0}; //shut up gcc
1031 int cbp_luma_new = 0;
1032 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1034 i_best = COST_MAX64;
1038 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1039 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1040 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1042 for( ; *predict_mode >= 0; predict_mode++ )
1044 int i_mode = *predict_mode;
1045 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1048 if( h->mb.b_lossless )
1049 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1051 h->predict_8x8[i_mode]( p_dst_by, edge );
1052 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1053 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1055 if( i_best > i_satd )
1057 a->i_predict8x8[idx] = i_mode;
1058 cbp_luma_new = h->mb.i_cbp_luma;
1061 pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1063 for( int j = 0; j < 7; j++ )
1064 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1065 i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1066 i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1069 a->i_cbp_i8x8_luma = cbp_luma_new;
1070 M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1072 for( int j = 0; j < 7; j++ )
1073 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1074 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1075 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1077 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1082 #define LOAD_FENC( m, src, xoff, yoff) \
1083 (m)->p_cost_mv = a->p_cost_mv; \
1084 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1085 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1086 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1087 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1088 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1090 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1091 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1092 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1093 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1094 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1095 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1096 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1097 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1098 (m)->weight = weight_none; \
1101 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1102 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1103 (m)->weight = h->sh.weight[i_ref];
1105 #define REF_COST(list, ref) \
1106 (a->p_cost_ref[list][ref])
1108 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1112 ALIGNED_4( int16_t mvc[8][2] );
1113 int i_halfpel_thresh = INT_MAX;
1114 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1116 /* 16x16 Search on all ref frame */
1117 m.i_pixel = PIXEL_16x16;
1118 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1120 a->l0.me16x16.cost = INT_MAX;
1121 for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1123 m.i_ref_cost = REF_COST( 0, i_ref );
1124 i_halfpel_thresh -= m.i_ref_cost;
1126 /* search with ref */
1127 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1128 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1130 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1132 if( h->mb.ref_blind_dupe == i_ref )
1134 CP32( m.mv, a->l0.mvc[0][0] );
1135 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1139 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1140 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1143 /* save mv for predicting neighbors */
1144 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1145 CP32( a->l0.mvc[i_ref][0], m.mv );
1147 /* early termination
1148 * SSD threshold would probably be better than SATD */
1151 && m.cost-m.cost_mv < 300*a->i_lambda
1152 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1153 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1154 && x264_macroblock_probe_pskip( h ) )
1156 h->mb.i_type = P_SKIP;
1157 x264_analyse_update_cache( h, a );
1158 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1162 m.cost += m.i_ref_cost;
1163 i_halfpel_thresh += m.i_ref_cost;
1165 if( m.cost < a->l0.me16x16.cost )
1166 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1169 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1170 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1172 h->mb.i_type = P_L0;
1175 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1176 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1178 h->mb.i_partition = D_16x16;
1179 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1180 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1181 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1182 h->mb.i_type = P_SKIP;
1187 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1190 uint8_t **p_fenc = h->mb.pic.p_fenc;
1191 int i_maxref = h->mb.pic.i_fref[0]-1;
1193 h->mb.i_partition = D_8x8;
1195 #define CHECK_NEIGHBOUR(i)\
1197 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1198 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1202 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1203 * than those used by the neighbors */
1204 if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1205 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1208 CHECK_NEIGHBOUR( -8 - 1 );
1209 CHECK_NEIGHBOUR( -8 + 0 );
1210 CHECK_NEIGHBOUR( -8 + 2 );
1211 CHECK_NEIGHBOUR( -8 + 4 );
1212 CHECK_NEIGHBOUR( 0 - 1 );
1213 CHECK_NEIGHBOUR( 2*8 - 1 );
1215 #undef CHECK_NEIGHBOUR
1217 for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1218 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1220 for( int i = 0; i < 4; i++ )
1222 x264_me_t *l0m = &a->l0.me8x8[i];
1226 m.i_pixel = PIXEL_8x8;
1228 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1229 l0m->cost = INT_MAX;
1230 for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1232 m.i_ref_cost = REF_COST( 0, i_ref );
1234 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1235 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1237 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1238 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1239 if( h->mb.ref_blind_dupe == i_ref )
1241 CP32( m.mv, a->l0.mvc[0][i+1] );
1242 x264_me_refine_qpel_refdupe( h, &m, NULL );
1245 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1247 m.cost += m.i_ref_cost;
1249 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1251 if( m.cost < l0m->cost )
1252 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1253 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1254 i_ref = h->mb.ref_blind_dupe;
1258 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1259 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1261 a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1263 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1264 are effectively zero. */
1265 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1266 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1269 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1270 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1271 /* P_8x8 ref0 has no ref cost */
1272 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1273 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1274 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1275 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1276 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1279 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1281 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1282 * reference frame flags. Thus, if we're not doing mixedrefs, just
1283 * don't bother analysing the dupes. */
1284 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1285 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1286 uint8_t **p_fenc = h->mb.pic.p_fenc;
1288 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1290 /* XXX Needed for x264_mb_predict_mv */
1291 h->mb.i_partition = D_8x8;
1294 CP32( mvc[0], a->l0.me16x16.mv );
1296 for( int i = 0; i < 4; i++ )
1298 x264_me_t *m = &a->l0.me8x8[i];
1302 m->i_pixel = PIXEL_8x8;
1303 m->i_ref_cost = i_ref_cost;
1305 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1306 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1307 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1309 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1310 x264_me_search( h, m, mvc, i_mvc );
1312 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1314 CP32( mvc[i_mvc], m->mv );
1317 a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1320 m->cost += i_ref_cost;
1321 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1322 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1325 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1326 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1327 /* theoretically this should include 4*ref_cost,
1328 * but 3 seems a better approximation of cabac. */
1329 if( h->param.b_cabac )
1330 a->l0.i_cost8x8 -= i_ref_cost;
1331 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1332 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1335 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1338 uint8_t **p_fenc = h->mb.pic.p_fenc;
1339 ALIGNED_4( int16_t mvc[3][2] );
1341 /* XXX Needed for x264_mb_predict_mv */
1342 h->mb.i_partition = D_16x8;
1344 for( int i = 0; i < 2; i++ )
1346 x264_me_t *l0m = &a->l0.me16x8[i];
1347 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1348 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1349 const int ref8[2] = { minref, maxref };
1350 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1352 m.i_pixel = PIXEL_16x8;
1354 LOAD_FENC( &m, p_fenc, 0, 8*i );
1355 l0m->cost = INT_MAX;
1356 for( int j = 0; j < i_ref8s; j++ )
1358 const int i_ref = ref8[j];
1359 m.i_ref_cost = REF_COST( 0, i_ref );
1361 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1362 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1363 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1364 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1366 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1367 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1369 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1370 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1371 /* We can only take this shortcut if the first search was performed on ref0. */
1372 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1374 /* We can just leave the MV from the previous ref search. */
1375 x264_me_refine_qpel_refdupe( h, &m, NULL );
1378 x264_me_search( h, &m, mvc, 3 );
1380 m.cost += m.i_ref_cost;
1382 if( m.cost < l0m->cost )
1383 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1386 /* Early termination based on the current SATD score of partition[0]
1387 plus the estimated SATD score of partition[1] */
1388 if( !i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1390 a->l0.i_cost16x8 = COST_MAX;
1394 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1395 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1398 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1401 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1404 uint8_t **p_fenc = h->mb.pic.p_fenc;
1405 ALIGNED_4( int16_t mvc[3][2] );
1407 /* XXX Needed for x264_mb_predict_mv */
1408 h->mb.i_partition = D_8x16;
1410 for( int i = 0; i < 2; i++ )
1412 x264_me_t *l0m = &a->l0.me8x16[i];
1413 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1414 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1415 const int ref8[2] = { minref, maxref };
1416 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1418 m.i_pixel = PIXEL_8x16;
1420 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1421 l0m->cost = INT_MAX;
1422 for( int j = 0; j < i_ref8s; j++ )
1424 const int i_ref = ref8[j];
1425 m.i_ref_cost = REF_COST( 0, i_ref );
1427 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1428 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1429 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1431 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1432 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1434 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1435 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1436 /* We can only take this shortcut if the first search was performed on ref0. */
1437 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1439 /* We can just leave the MV from the previous ref search. */
1440 x264_me_refine_qpel_refdupe( h, &m, NULL );
1443 x264_me_search( h, &m, mvc, 3 );
1445 m.cost += m.i_ref_cost;
1447 if( m.cost < l0m->cost )
1448 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1451 /* Early termination based on the current SATD score of partition[0]
1452 plus the estimated SATD score of partition[1] */
1453 if( !i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1455 a->l0.i_cost8x16 = COST_MAX;
1459 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1460 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1463 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1466 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1468 ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1469 uint8_t *pix2 = pix1+8;
1470 const int i_stride = h->mb.pic.i_stride[1];
1471 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1472 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1473 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1474 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1475 x264_weight_t *weight = h->sh.weight[i_ref];
1477 #define CHROMA4x4MC( width, height, me, x, y ) \
1478 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1479 if( weight[1].weightfn ) \
1480 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1481 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1482 if( weight[2].weightfn ) \
1483 weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1486 if( pixel == PIXEL_4x4 )
1488 x264_me_t *m = a->l0.me4x4[i8x8];
1489 CHROMA4x4MC( 2,2, m[0], 0,0 );
1490 CHROMA4x4MC( 2,2, m[1], 2,0 );
1491 CHROMA4x4MC( 2,2, m[2], 0,2 );
1492 CHROMA4x4MC( 2,2, m[3], 2,2 );
1494 else if( pixel == PIXEL_8x4 )
1496 x264_me_t *m = a->l0.me8x4[i8x8];
1497 CHROMA4x4MC( 4,2, m[0], 0,0 );
1498 CHROMA4x4MC( 4,2, m[1], 0,2 );
1502 x264_me_t *m = a->l0.me4x8[i8x8];
1503 CHROMA4x4MC( 2,4, m[0], 0,0 );
1504 CHROMA4x4MC( 2,4, m[1], 2,0 );
1507 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1508 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1511 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1513 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1514 uint8_t **p_fenc = h->mb.pic.p_fenc;
1515 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1517 /* XXX Needed for x264_mb_predict_mv */
1518 h->mb.i_partition = D_8x8;
1520 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1522 const int idx = 4*i8x8 + i4x4;
1523 const int x4 = block_idx_x[idx];
1524 const int y4 = block_idx_y[idx];
1525 const int i_mvc = (i4x4 == 0);
1527 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1529 m->i_pixel = PIXEL_4x4;
1531 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1532 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1533 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1535 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1536 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1538 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1540 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1541 a->l0.me4x4[i8x8][1].cost +
1542 a->l0.me4x4[i8x8][2].cost +
1543 a->l0.me4x4[i8x8][3].cost +
1544 REF_COST( 0, i_ref ) +
1545 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1546 if( h->mb.b_chroma_me )
1547 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1550 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1552 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1553 uint8_t **p_fenc = h->mb.pic.p_fenc;
1554 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1556 /* XXX Needed for x264_mb_predict_mv */
1557 h->mb.i_partition = D_8x8;
1559 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1561 const int idx = 4*i8x8 + 2*i8x4;
1562 const int x4 = block_idx_x[idx];
1563 const int y4 = block_idx_y[idx];
1564 const int i_mvc = (i8x4 == 0);
1566 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1568 m->i_pixel = PIXEL_8x4;
1570 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1571 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1572 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1574 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1575 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1577 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1579 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1580 REF_COST( 0, i_ref ) +
1581 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1582 if( h->mb.b_chroma_me )
1583 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1586 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1588 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1589 uint8_t **p_fenc = h->mb.pic.p_fenc;
1590 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1592 /* XXX Needed for x264_mb_predict_mv */
1593 h->mb.i_partition = D_8x8;
1595 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1597 const int idx = 4*i8x8 + i4x8;
1598 const int x4 = block_idx_x[idx];
1599 const int y4 = block_idx_y[idx];
1600 const int i_mvc = (i4x8 == 0);
1602 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1604 m->i_pixel = PIXEL_4x8;
1606 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1607 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1608 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1610 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1611 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1613 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1615 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1616 REF_COST( 0, i_ref ) +
1617 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1618 if( h->mb.b_chroma_me )
1619 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1622 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1624 /* Assumes that fdec still contains the results of
1625 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1627 uint8_t *p_fenc = h->mb.pic.p_fenc[0];
1628 uint8_t *p_fdec = h->mb.pic.p_fdec[0];
1630 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1631 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1632 for( int i = 0; i < 4; i++ )
1634 const int x = (i&1)*8;
1635 const int y = (i>>1)*8;
1636 a->i_cost16x16direct +=
1637 a->i_cost8x8direct[i] =
1638 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1641 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1644 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
1647 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1649 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1650 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1651 uint8_t *src0, *src1;
1652 int stride0 = 16, stride1 = 16;
1654 ALIGNED_4( int16_t mvc[9][2] );
1655 int try_skip = a->b_try_skip;
1656 int list1_skipped = 0;
1657 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
1658 int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
1659 h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};
1662 m.i_pixel = PIXEL_16x16;
1664 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1666 /* 16x16 Search on list 0 and list 1 */
1667 a->l0.me16x16.cost = INT_MAX;
1668 a->l1.me16x16.cost = INT_MAX;
1669 for( int l = 1; l >= 0; )
1671 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1673 /* This loop is extremely munged in order to facilitate the following order of operations,
1674 * necessary for an efficient fast skip.
1675 * 1. Search list1 ref0.
1676 * 2. Search list0 ref0.
1678 * 4. Search the rest of list0.
1679 * 5. Go back and finish list1.
1681 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1683 if( try_skip && l == 1 && i_ref > 0 )
1689 m.i_ref_cost = REF_COST( l, i_ref );
1691 /* search with ref */
1692 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1693 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1694 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1695 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
1698 m.cost += m.i_ref_cost;
1700 if( m.cost < lX->me16x16.cost )
1701 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1703 /* save mv for predicting neighbors */
1704 CP32( lX->mvc[i_ref][0], m.mv );
1705 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1707 /* Fast skip detection. */
1708 if( i_ref == 0 && try_skip )
1710 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
1711 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
1717 /* We already tested skip */
1718 h->mb.i_type = B_SKIP;
1719 x264_analyse_update_cache( h, a );
1724 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
1726 if( list1_skipped && l == 0 )
1732 /* get cost of BI mode */
1733 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1734 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1735 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1736 src0 = h->mc.get_ref( pix0, &stride0,
1737 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1738 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1739 src1 = h->mc.get_ref( pix1, &stride1,
1740 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1741 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1743 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1745 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1747 + a->l0.bi16x16.cost_mv
1748 + a->l1.bi16x16.cost_mv;
1750 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1751 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1753 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1754 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1755 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1756 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1757 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1758 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1759 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1760 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1761 + ref_costs + l0_mv_cost + l1_mv_cost;
1762 if( cost00 < a->i_cost16x16bi )
1764 M32( a->l0.bi16x16.mv ) = 0;
1765 M32( a->l1.bi16x16.mv ) = 0;
1766 a->l0.bi16x16.cost_mv = l0_mv_cost;
1767 a->l1.bi16x16.cost_mv = l1_mv_cost;
1768 a->i_cost16x16bi = cost00;
1773 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1774 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1775 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1778 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1780 const int x = 2*(i%2);
1781 const int y = 2*(i/2);
1783 switch( h->mb.i_sub_partition[i] )
1786 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1789 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1790 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1793 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1794 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1797 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1798 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1799 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1800 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1803 x264_log( h, X264_LOG_ERROR, "internal error\n" );
1808 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1810 const int x = 2*(idx&1);
1811 const int y = 2*(idx>>1);
1812 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1813 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1814 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1815 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
1818 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1819 if( x264_mb_partition_listX_table[0][part] ) \
1821 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
1822 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1826 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1827 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1829 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1831 if( x264_mb_partition_listX_table[1][part] ) \
1833 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
1834 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1838 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1839 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1841 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1844 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1848 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1850 x264_mb_load_mv_direct8x8( h, i );
1853 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1854 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1855 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1860 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1863 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1865 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1867 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1869 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1873 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1875 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1876 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
1878 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1879 * than those used by the neighbors */
1880 #define CHECK_NEIGHBOUR(i)\
1882 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
1883 if( ref > i_maxref[l] )\
1887 for( int l = 0; l < 2; l++ )
1889 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1890 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
1891 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1894 CHECK_NEIGHBOUR( -8 - 1 );
1895 CHECK_NEIGHBOUR( -8 + 0 );
1896 CHECK_NEIGHBOUR( -8 + 2 );
1897 CHECK_NEIGHBOUR( -8 + 4 );
1898 CHECK_NEIGHBOUR( 0 - 1 );
1899 CHECK_NEIGHBOUR( 2*8 - 1 );
1903 /* XXX Needed for x264_mb_predict_mv */
1904 h->mb.i_partition = D_8x8;
1908 for( int i = 0; i < 4; i++ )
1914 int stride[2] = {8,8};
1917 m.i_pixel = PIXEL_8x8;
1918 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1920 for( int l = 0; l < 2; l++ )
1922 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1924 lX->me8x8[i].cost = INT_MAX;
1925 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
1927 m.i_ref_cost = REF_COST( l, i_ref );;
1929 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
1931 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
1932 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
1933 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
1934 m.cost += m.i_ref_cost;
1936 if( m.cost < lX->me8x8[i].cost )
1938 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
1939 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
1942 /* save mv for predicting other partitions within this MB */
1943 CP32( lX->mvc[i_ref][i+1], m.mv );
1948 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
1949 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
1950 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
1951 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
1952 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
1953 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
1955 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1956 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
1957 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
1958 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1960 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1961 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1963 i_part_cost = a->l0.me8x8[i].cost;
1964 h->mb.i_sub_partition[i] = D_L0_8x8;
1965 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1966 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1967 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1968 a->i_cost8x8bi += i_part_cost;
1970 /* XXX Needed for x264_mb_predict_mv */
1971 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1975 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1978 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1980 uint8_t **p_fref[2] =
1981 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
1982 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
1983 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1985 /* XXX Needed for x264_mb_predict_mv */
1986 h->mb.i_partition = D_8x8;
1990 for( int i = 0; i < 4; i++ )
1995 int i_part_cost_bi = 0;
1996 int stride[2] = {8,8};
1999 for( int l = 0; l < 2; l++ )
2001 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2002 x264_me_t *m = &lX->me8x8[i];
2003 m->i_pixel = PIXEL_8x8;
2004 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2006 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2007 m->i_ref = lX->me16x16.i_ref;
2009 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2011 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2012 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2013 x264_me_search( h, m, &lX->me16x16.mv, 1 );
2014 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2015 m->cost += m->i_ref_cost;
2017 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2019 /* save mv for predicting other partitions within this MB */
2020 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2023 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2024 m->mv[0], m->mv[1], 8, 8, weight_none );
2025 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2027 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2028 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2029 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2030 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2031 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2033 i_part_cost = a->l0.me8x8[i].cost;
2034 h->mb.i_sub_partition[i] = D_L0_8x8;
2035 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2036 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2037 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2038 a->i_cost8x8bi += i_part_cost;
2040 /* XXX Needed for x264_mb_predict_mv */
2041 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2045 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2048 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2050 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
2051 ALIGNED_4( int16_t mvc[3][2] );
2053 h->mb.i_partition = D_16x8;
2054 a->i_cost16x8bi = 0;
2056 for( int i = 0; i < 2; i++ )
2059 int i_part_cost_bi = 0;
2060 int stride[2] = {16,16};
2063 m.i_pixel = PIXEL_16x8;
2064 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2066 for( int l = 0; l < 2; l++ )
2068 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2069 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2070 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2071 lX->me16x8[i].cost = INT_MAX;
2072 for( int j = 0; j < i_ref8s; j++ )
2074 int i_ref = ref8[j];
2075 m.i_ref_cost = REF_COST( l, i_ref );;
2077 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2079 CP32( mvc[0], lX->mvc[i_ref][0] );
2080 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2081 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2083 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2084 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2085 x264_me_search( h, &m, mvc, 3 );
2086 m.cost += m.i_ref_cost;
2088 if( m.cost < lX->me16x8[i].cost )
2089 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2094 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2095 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2096 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2097 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2098 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2099 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2101 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2102 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2103 + a->l1.me16x8[i].i_ref_cost;
2105 i_part_cost = a->l0.me16x8[i].cost;
2106 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2108 if( a->l1.me16x8[i].cost < i_part_cost )
2110 i_part_cost = a->l1.me16x8[i].cost;
2111 a->i_mb_partition16x8[i] = D_L1_8x8;
2113 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2115 i_part_cost = i_part_cost_bi;
2116 a->i_mb_partition16x8[i] = D_BI_8x8;
2118 a->i_cost16x8bi += i_part_cost;
2120 /* Early termination based on the current SATD score of partition[0]
2121 plus the estimated SATD score of partition[1] */
2122 if( !i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2123 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2125 a->i_cost16x8bi = COST_MAX;
2129 x264_mb_cache_mv_b16x8( h, a, i, 0 );
2133 a->i_mb_type16x8 = B_L0_L0
2134 + (a->i_mb_partition16x8[0]>>2) * 3
2135 + (a->i_mb_partition16x8[1]>>2);
2136 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2139 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2141 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2142 ALIGNED_4( int16_t mvc[3][2] );
2144 h->mb.i_partition = D_8x16;
2145 a->i_cost8x16bi = 0;
2147 for( int i = 0; i < 2; i++ )
2150 int i_part_cost_bi = 0;
2151 int stride[2] = {8,8};
2154 m.i_pixel = PIXEL_8x16;
2155 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2157 for( int l = 0; l < 2; l++ )
2159 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2160 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2161 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2162 lX->me8x16[i].cost = INT_MAX;
2163 for( int j = 0; j < i_ref8s; j++ )
2165 int i_ref = ref8[j];
2166 m.i_ref_cost = REF_COST( l, i_ref );
2168 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2170 CP32( mvc[0], lX->mvc[i_ref][0] );
2171 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2172 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2174 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2175 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2176 x264_me_search( h, &m, mvc, 3 );
2177 m.cost += m.i_ref_cost;
2179 if( m.cost < lX->me8x16[i].cost )
2180 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2185 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2186 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2187 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2188 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2189 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2191 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2192 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2193 + a->l1.me8x16[i].i_ref_cost;
2195 i_part_cost = a->l0.me8x16[i].cost;
2196 a->i_mb_partition8x16[i] = D_L0_8x8;
2198 if( a->l1.me8x16[i].cost < i_part_cost )
2200 i_part_cost = a->l1.me8x16[i].cost;
2201 a->i_mb_partition8x16[i] = D_L1_8x8;
2203 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2205 i_part_cost = i_part_cost_bi;
2206 a->i_mb_partition8x16[i] = D_BI_8x8;
2208 a->i_cost8x16bi += i_part_cost;
2210 /* Early termination based on the current SATD score of partition[0]
2211 plus the estimated SATD score of partition[1] */
2212 if( !i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2213 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2215 a->i_cost8x16bi = COST_MAX;
2219 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2223 a->i_mb_type8x16 = B_L0_L0
2224 + (a->i_mb_partition8x16[0]>>2) * 3
2225 + (a->i_mb_partition8x16[1]>>2);
2226 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2229 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2231 int thresh = i_satd * 5/4;
2233 h->mb.i_type = P_L0;
2234 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2236 h->mb.i_partition = D_16x16;
2237 x264_analyse_update_cache( h, a );
2238 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2241 if( a->l0.i_cost16x8 <= thresh )
2243 h->mb.i_partition = D_16x8;
2244 x264_analyse_update_cache( h, a );
2245 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2248 a->l0.i_cost16x8 = COST_MAX;
2250 if( a->l0.i_cost8x16 <= thresh )
2252 h->mb.i_partition = D_8x16;
2253 x264_analyse_update_cache( h, a );
2254 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2257 a->l0.i_cost8x16 = COST_MAX;
2259 if( a->l0.i_cost8x8 <= thresh )
2261 h->mb.i_type = P_8x8;
2262 h->mb.i_partition = D_8x8;
2263 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2265 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2266 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2267 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2268 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2269 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2270 * for future blocks are those left over from previous RDO calls. */
2271 for( int i = 0; i < 4; i++ )
2273 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2274 int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2275 int subtype, btype = D_L0_8x8;
2276 uint64_t bcost = COST_MAX64;
2277 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2280 if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2282 h->mb.i_sub_partition[i] = subtype;
2283 x264_mb_cache_mv_p8x8( h, a, i );
2284 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2285 COPY2_IF_LT( bcost, cost, btype, subtype );
2287 if( h->mb.i_sub_partition[i] != btype )
2289 h->mb.i_sub_partition[i] = btype;
2290 x264_mb_cache_mv_p8x8( h, a, i );
2295 x264_analyse_update_cache( h, a );
2296 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2299 a->l0.i_cost8x8 = COST_MAX;
2302 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2304 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2306 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2308 h->mb.i_type = B_DIRECT;
2309 /* Assumes direct/skip MC is still in fdec */
2310 /* Requires b-rdo to be done before intra analysis */
2311 h->mb.b_skip_mc = 1;
2312 x264_analyse_update_cache( h, a );
2313 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2314 h->mb.b_skip_mc = 0;
2317 //FIXME not all the update_cache calls are needed
2318 h->mb.i_partition = D_16x16;
2320 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2322 h->mb.i_type = B_L0_L0;
2323 x264_analyse_update_cache( h, a );
2324 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2328 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2330 h->mb.i_type = B_L1_L1;
2331 x264_analyse_update_cache( h, a );
2332 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2336 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2338 h->mb.i_type = B_BI_BI;
2339 x264_analyse_update_cache( h, a );
2340 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2344 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2346 h->mb.i_type = B_8x8;
2347 h->mb.i_partition = D_8x8;
2348 x264_analyse_update_cache( h, a );
2349 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2350 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2354 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2356 h->mb.i_type = a->i_mb_type16x8;
2357 h->mb.i_partition = D_16x8;
2358 x264_analyse_update_cache( h, a );
2359 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2363 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2365 h->mb.i_type = a->i_mb_type8x16;
2366 h->mb.i_partition = D_8x16;
2367 x264_analyse_update_cache( h, a );
2368 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2372 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2376 if( IS_INTRA(h->mb.i_type) )
2379 switch( h->mb.i_partition )
2382 if( h->mb.i_type == B_BI_BI )
2384 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2385 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2389 for( int i = 0; i < 2; i++ )
2390 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2392 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2393 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2397 for( int i = 0; i < 2; i++ )
2398 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2400 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2401 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2405 for( int i = 0; i < 4; i++ )
2406 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2408 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2409 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2415 static inline void x264_mb_analyse_transform( x264_t *h )
2417 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2419 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2422 int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2423 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2424 int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2425 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2427 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2428 h->mb.b_skip_mc = 1;
2432 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2434 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2436 x264_analyse_update_cache( h, a );
2437 h->mb.b_transform_8x8 ^= 1;
2438 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2439 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2441 if( *i_rd >= i_rd8 )
2444 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2448 h->mb.b_transform_8x8 ^= 1;
2452 /* Rate-distortion optimal QP selection.
2453 * FIXME: More than half of the benefit of this function seems to be
2454 * in the way it improves the coding of chroma DC (by decimating or
2455 * finding a better way to code a single DC coefficient.)
2456 * There must be a more efficient way to get that portion of the benefit
2457 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2459 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2461 int bcost, cost, failures, prevcost, origcost;
2462 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2463 int last_qp_tried = 0;
2464 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2465 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2467 /* If CBP is already zero, don't raise the quantizer any higher. */
2468 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2470 /* Without psy-RD, require monotonicity when moving quant away from previous
2471 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2472 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2473 * allow 2 failures when moving quant towards previous quant.
2474 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2475 int threshold = (!!h->mb.i_psy_rd);
2476 /* Raise the threshold for failures if we're moving towards the last QP. */
2477 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2478 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2480 h->mb.i_qp = orig_qp;
2482 prevcost = origcost;
2484 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2485 * (up to a point) will too. So, jump down to where the threshold will kick in
2486 * and check the QP there. If the CBP is still empty, skip the main loop.
2487 * If it isn't empty, we would have ended up having to check this QP anyways,
2488 * so as long as we store it for later lookup, we lose nothing. */
2489 int already_checked_qp = -1;
2490 int already_checked_cost = COST_MAX;
2491 if( direction == -1 )
2495 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2496 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2497 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2498 if( !h->mb.cbp[h->mb.i_mb_xy] )
2500 /* If our empty-CBP block is lower QP than the last QP,
2501 * the last QP almost surely doesn't have a CBP either. */
2502 if( h->mb.i_last_qp > h->mb.i_qp )
2506 already_checked_qp = h->mb.i_qp;
2507 h->mb.i_qp = orig_qp;
2511 h->mb.i_qp += direction;
2512 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2514 if( h->mb.i_last_qp == h->mb.i_qp )
2516 if( h->mb.i_qp == already_checked_qp )
2517 cost = already_checked_cost;
2520 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2521 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2522 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2525 /* We can't assume that the costs are monotonic over QPs.
2526 * Tie case-as-failure seems to give better results. */
2527 if( cost < prevcost )
2533 if( failures > threshold )
2535 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2537 h->mb.i_qp += direction;
2541 /* Always try the last block's QP. */
2542 if( !last_qp_tried )
2544 h->mb.i_qp = h->mb.i_last_qp;
2545 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2546 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2547 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2551 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2553 /* Check transform again; decision from before may no longer be optimal. */
2554 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2555 x264_mb_transform_8x8_allowed( h ) )
2557 h->mb.b_transform_8x8 ^= 1;
2558 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2560 h->mb.b_transform_8x8 ^= 1;
2564 /*****************************************************************************
2565 * x264_macroblock_analyse:
2566 *****************************************************************************/
2567 void x264_macroblock_analyse( x264_t *h )
2569 x264_mb_analysis_t analysis;
2570 int i_cost = COST_MAX;
2572 h->mb.i_qp = x264_ratecontrol_qp( h );
2573 if( h->param.rc.i_aq_mode )
2575 x264_adaptive_quant( h );
2576 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2577 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2578 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2579 h->mb.i_qp = h->mb.i_last_qp;
2582 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2584 /*--------------------------- Do the analysis ---------------------------*/
2585 if( h->sh.i_type == SLICE_TYPE_I )
2588 if( analysis.i_mbrd )
2589 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2590 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2591 if( analysis.i_mbrd )
2592 x264_intra_rd( h, &analysis, COST_MAX );
2594 i_cost = analysis.i_satd_i16x16;
2595 h->mb.i_type = I_16x16;
2596 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2597 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2598 if( analysis.i_satd_pcm < i_cost )
2599 h->mb.i_type = I_PCM;
2601 else if( analysis.i_mbrd >= 2 )
2602 x264_intra_rd_refine( h, &analysis );
2604 else if( h->sh.i_type == SLICE_TYPE_P )
2608 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2610 analysis.b_try_skip = 0;
2611 if( analysis.b_force_intra )
2613 if( !h->param.analyse.b_psy )
2615 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2616 goto intra_analysis;
2621 /* Fast P_SKIP detection */
2622 if( h->param.analyse.b_fast_pskip )
2624 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2625 // FIXME don't need to check this if the reference frame is done
2627 else if( h->param.analyse.i_subpel_refine >= 3 )
2628 analysis.b_try_skip = 1;
2629 else if( h->mb.i_mb_type_left == P_SKIP ||
2630 h->mb.i_mb_type_top == P_SKIP ||
2631 h->mb.i_mb_type_topleft == P_SKIP ||
2632 h->mb.i_mb_type_topright == P_SKIP )
2633 b_skip = x264_macroblock_probe_pskip( h );
2637 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2641 h->mb.i_type = P_SKIP;
2642 h->mb.i_partition = D_16x16;
2643 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2644 /* Set up MVs for future predictors */
2646 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2647 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2651 const unsigned int flags = h->param.analyse.inter;
2655 int i_satd_inter, i_satd_intra;
2657 x264_mb_analyse_load_costs( h, &analysis );
2659 x264_mb_analyse_inter_p16x16( h, &analysis );
2661 if( h->mb.i_type == P_SKIP )
2663 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2664 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2668 if( flags & X264_ANALYSE_PSUB16x16 )
2670 if( h->param.analyse.b_mixed_references )
2671 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2673 x264_mb_analyse_inter_p8x8( h, &analysis );
2676 /* Select best inter mode */
2678 i_partition = D_16x16;
2679 i_cost = analysis.l0.me16x16.cost;
2681 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2682 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2685 i_partition = D_8x8;
2686 i_cost = analysis.l0.i_cost8x8;
2689 if( flags & X264_ANALYSE_PSUB8x8 )
2691 for( int i = 0; i < 4; i++ )
2693 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2694 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2696 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2697 h->mb.i_sub_partition[i] = D_L0_4x4;
2699 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2700 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2701 h->mb.i_sub_partition[i], D_L0_8x4 );
2703 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2704 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2705 h->mb.i_sub_partition[i], D_L0_4x8 );
2707 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2709 x264_mb_cache_mv_p8x8( h, &analysis, i );
2711 analysis.l0.i_cost8x8 = i_cost;
2715 /* Now do 16x8/8x16 */
2716 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2717 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2718 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2720 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
2721 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2722 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2724 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
2725 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2727 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
2728 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2729 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2731 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
2732 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2735 h->mb.i_partition = i_partition;
2738 //FIXME mb_type costs?
2739 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2743 else if( i_partition == D_16x16 )
2745 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2746 i_cost = analysis.l0.me16x16.cost;
2748 else if( i_partition == D_16x8 )
2750 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2751 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2752 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2754 else if( i_partition == D_8x16 )
2756 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2757 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2758 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2760 else if( i_partition == D_8x8 )
2763 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2765 switch( h->mb.i_sub_partition[i8x8] )
2768 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2769 i_cost += analysis.l0.me8x8[i8x8].cost;
2772 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2773 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2774 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2775 analysis.l0.me8x4[i8x8][1].cost;
2778 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2779 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2780 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2781 analysis.l0.me4x8[i8x8][1].cost;
2785 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2786 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2787 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2788 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2789 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2790 analysis.l0.me4x4[i8x8][1].cost +
2791 analysis.l0.me4x4[i8x8][2].cost +
2792 analysis.l0.me4x4[i8x8][3].cost;
2795 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2801 if( h->mb.b_chroma_me )
2803 x264_mb_analyse_intra_chroma( h, &analysis );
2804 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2805 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2806 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2807 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2810 x264_mb_analyse_intra( h, &analysis, i_cost );
2812 i_satd_inter = i_cost;
2813 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2814 analysis.i_satd_i8x8,
2815 analysis.i_satd_i4x4 );
2817 if( analysis.i_mbrd )
2819 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2821 i_partition = D_16x16;
2822 i_cost = analysis.l0.i_rd16x16;
2823 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2824 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2825 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2826 h->mb.i_type = i_type;
2827 h->mb.i_partition = i_partition;
2828 if( i_cost < COST_MAX )
2829 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2830 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2833 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2834 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2835 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2836 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2838 h->mb.i_type = i_type;
2840 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2842 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2843 * it was an inter block. */
2844 x264_analyse_update_cache( h, &analysis );
2845 x264_macroblock_encode( h );
2846 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2847 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2848 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2849 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2850 goto intra_analysis;
2853 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2855 if( IS_INTRA( h->mb.i_type ) )
2857 x264_intra_rd_refine( h, &analysis );
2859 else if( i_partition == D_16x16 )
2861 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2862 analysis.l0.me16x16.cost = i_cost;
2863 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2865 else if( i_partition == D_16x8 )
2867 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2868 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2869 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2870 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2871 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2872 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2874 else if( i_partition == D_8x16 )
2876 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2877 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2878 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2879 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2880 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2881 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2883 else if( i_partition == D_8x8 )
2885 x264_analyse_update_cache( h, &analysis );
2886 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2888 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2890 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2892 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2894 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2895 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2897 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2899 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2900 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2902 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2904 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2905 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2906 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2907 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2914 else if( h->sh.i_type == SLICE_TYPE_B )
2916 int i_bskip_cost = COST_MAX;
2919 if( analysis.i_mbrd )
2920 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2922 h->mb.i_type = B_SKIP;
2923 if( h->mb.b_direct_auto_write )
2925 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2926 for( int i = 0; i < 2; i++ )
2929 h->sh.b_direct_spatial_mv_pred ^= 1;
2930 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2931 if( analysis.b_direct_available )
2936 b_skip = x264_macroblock_probe_bskip( h );
2938 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2945 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2947 analysis.b_try_skip = 0;
2948 if( analysis.b_direct_available )
2950 if( !h->mb.b_direct_auto_write )
2952 if( analysis.i_mbrd )
2954 i_bskip_cost = ssd_mb( h );
2955 /* 6 = minimum cavlc cost of a non-skipped MB */
2956 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2958 else if( !h->mb.b_direct_auto_write )
2960 /* Conditioning the probe on neighboring block types
2961 * doesn't seem to help speed or quality. */
2962 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
2963 if( h->param.analyse.i_subpel_refine < 3 )
2964 b_skip = analysis.b_try_skip;
2966 /* Set up MVs for future predictors */
2969 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2970 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2971 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
2972 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
2978 const unsigned int flags = h->param.analyse.inter;
2982 h->mb.b_skip_mc = 0;
2983 h->mb.i_type = B_DIRECT;
2985 x264_mb_analyse_load_costs( h, &analysis );
2987 /* select best inter mode */
2988 /* direct must be first */
2989 if( analysis.b_direct_available )
2990 x264_mb_analyse_inter_direct( h, &analysis );
2992 x264_mb_analyse_inter_b16x16( h, &analysis );
2994 if( h->mb.i_type == B_SKIP )
2996 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2997 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2998 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
2999 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3004 i_partition = D_16x16;
3005 i_cost = analysis.l0.me16x16.cost;
3006 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
3007 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
3008 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
3010 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
3012 x264_mb_analyse_b_rd( h, &analysis, i_cost );
3013 if( i_bskip_cost < analysis.i_rd16x16direct &&
3014 i_bskip_cost < analysis.i_rd16x16bi &&
3015 i_bskip_cost < analysis.l0.i_rd16x16 &&
3016 i_bskip_cost < analysis.l1.i_rd16x16 )
3018 h->mb.i_type = B_SKIP;
3019 x264_analyse_update_cache( h, &analysis );
3024 if( flags & X264_ANALYSE_BSUB16x16 )
3026 if( h->param.analyse.b_mixed_references )
3027 x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
3029 x264_mb_analyse_inter_b8x8( h, &analysis );
3031 COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3033 /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
3034 int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
3035 int i_mb_type, i_partition16x8[2], i_partition8x16[2];
3036 for( int i = 0; i < 2; i++ )
3038 int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
3039 int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
3041 i_best_cost = COST_MAX;
3042 i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
3043 i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
3044 i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
3045 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
3046 + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3047 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
3048 + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3049 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
3050 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
3051 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
3052 analysis.i_cost_est16x8[i] = i_best_cost;
3055 i_best_cost = COST_MAX;
3056 i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
3057 i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
3058 i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
3059 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
3060 + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3061 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
3062 + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3063 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
3064 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
3065 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
3066 analysis.i_cost_est8x16[i] = i_best_cost;
3068 i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
3069 analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3070 i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
3071 i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
3072 analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3073 i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
3075 /* We can gain a little speed by checking the mode with the lowest estimated cost first */
3076 int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
3077 if( try_16x8_first && i_cost_est16x8bi_total < i_cost )
3079 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3080 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3082 if( i_cost_est8x16bi_total < i_cost )
3084 x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
3085 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3087 if( !try_16x8_first && i_cost_est16x8bi_total < i_cost )
3089 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3090 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3094 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3099 else if( i_partition == D_16x16 )
3101 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3102 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3103 if( i_type == B_L0_L0 )
3105 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3106 i_cost = analysis.l0.me16x16.cost
3107 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3109 else if( i_type == B_L1_L1 )
3111 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3112 i_cost = analysis.l1.me16x16.cost
3113 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3115 else if( i_type == B_BI_BI )
3117 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3118 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3121 else if( i_partition == D_16x8 )
3123 for( int i = 0; i < 2; i++ )
3125 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3126 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3127 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3128 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3131 else if( i_partition == D_8x16 )
3133 for( int i = 0; i < 2; i++ )
3135 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3136 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3137 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3138 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3141 else if( i_partition == D_8x8 )
3143 for( int i = 0; i < 4; i++ )
3146 int i_part_cost_old;
3148 int i_part_type = h->mb.i_sub_partition[i];
3149 int b_bidir = (i_part_type == D_BI_8x8);
3151 if( i_part_type == D_DIRECT_8x8 )
3153 if( x264_mb_partition_listX_table[0][i_part_type] )
3155 m = &analysis.l0.me8x8[i];
3156 i_part_cost_old = m->cost;
3157 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3158 m->cost -= i_type_cost;
3159 x264_me_refine_qpel( h, m );
3161 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3163 if( x264_mb_partition_listX_table[1][i_part_type] )
3165 m = &analysis.l1.me8x8[i];
3166 i_part_cost_old = m->cost;
3167 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3168 m->cost -= i_type_cost;
3169 x264_me_refine_qpel( h, m );
3171 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3173 /* TODO: update mvp? */
3177 i_satd_inter = i_cost;
3179 if( analysis.i_mbrd )
3181 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3183 i_cost = i_bskip_cost;
3184 i_partition = D_16x16;
3185 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3186 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3187 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3188 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3189 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3190 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3191 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3193 h->mb.i_type = i_type;
3194 h->mb.i_partition = i_partition;
3197 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3199 if( analysis.i_mbrd )
3201 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3202 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
3205 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3206 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3207 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3208 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3210 h->mb.i_type = i_type;
3211 h->mb.i_partition = i_partition;
3213 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3214 x264_intra_rd_refine( h, &analysis );
3215 if( h->mb.i_subpel_refine >= 5 )
3216 x264_refine_bidir( h, &analysis );
3218 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3221 x264_analyse_update_cache( h, &analysis );
3223 if( i_partition == D_16x16 )
3225 if( i_type == B_L0_L0 )
3227 analysis.l0.me16x16.cost = i_cost;
3228 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3230 else if( i_type == B_L1_L1 )
3232 analysis.l1.me16x16.cost = i_cost;
3233 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3235 else if( i_type == B_BI_BI )
3237 i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3238 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3241 else if( i_partition == D_16x8 )
3243 for( int i = 0; i < 2; i++ )
3245 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3246 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3247 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3248 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3249 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3250 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3252 i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3253 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3257 else if( i_partition == D_8x16 )
3259 for( int i = 0; i < 2; i++ )
3261 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3262 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3263 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3264 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3265 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3266 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3268 i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3269 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3273 else if( i_partition == D_8x8 )
3275 for( int i = 0; i < 4; i++ )
3277 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3278 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3279 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3280 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3281 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3283 i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3284 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3292 x264_analyse_update_cache( h, &analysis );
3294 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3295 * without realizing it. Check for this and account for it if necessary. */
3296 if( analysis.i_mbrd >= 2 )
3298 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3299 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3300 int list = check_mv_lists[h->mb.i_type] - 1;
3301 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3302 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3303 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3304 h->mb.i_partition = D_16x16;
3307 if( !analysis.i_mbrd )
3308 x264_mb_analyse_transform( h );
3310 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3311 x264_mb_analyse_qp_rd( h, &analysis );
3313 h->mb.b_trellis = h->param.analyse.i_trellis;
3314 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3315 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3316 x264_psy_trellis_init( h, 0 );
3317 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3318 h->mb.i_skip_intra = 0;
3321 /*-------------------- Update MB from the analysis ----------------------*/
3322 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3324 switch( h->mb.i_type )
3327 for( int i = 0; i < 16; i++ )
3328 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3330 x264_mb_analyse_intra_chroma( h, a );
3333 for( int i = 0; i < 4; i++ )
3334 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3336 x264_mb_analyse_intra_chroma( h, a );
3339 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3340 x264_mb_analyse_intra_chroma( h, a );
3347 switch( h->mb.i_partition )
3350 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3351 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3355 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3356 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3357 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3358 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3362 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3363 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3364 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3365 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3369 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3375 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3376 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3377 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3378 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3379 for( int i = 0; i < 4; i++ )
3380 x264_mb_cache_mv_p8x8( h, a, i );
3385 h->mb.i_partition = D_16x16;
3386 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3387 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3393 h->mb.i_partition = h->mb.cache.direct_partition;
3394 x264_mb_load_mv_direct8x8( h, 0 );
3395 x264_mb_load_mv_direct8x8( h, 1 );
3396 x264_mb_load_mv_direct8x8( h, 2 );
3397 x264_mb_load_mv_direct8x8( h, 3 );
3401 /* optimize: cache might not need to be rewritten */
3402 for( int i = 0; i < 4; i++ )
3403 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3406 default: /* the rest of the B types */
3407 switch( h->mb.i_partition )
3410 switch( h->mb.i_type )
3413 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3414 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3416 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3417 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3418 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3421 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3422 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3423 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3425 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3426 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3429 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3430 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3432 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3433 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3438 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3439 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3442 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3443 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3446 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3452 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3454 for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3457 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3460 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3461 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3463 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3464 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
3465 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
3466 h->mb.cache.mv[l][x264_scan8[15]][0],
3467 h->mb.cache.mv[l][x264_scan8[15]][1] );
3468 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
3469 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3470 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
3471 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3472 x264_mb_analyse_intra( h, a, COST_MAX );
3473 h->mb.i_type = I_16x16;
3474 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3475 x264_mb_analyse_intra_chroma( h, a );
3482 #include "slicetype.c"