/*****************************************************************************
 * analyse.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#define _ISOC99_SOURCE

#include "common/common.h"
#include "common/cpu.h"
#include "macroblock.h"
#include "ratecontrol.h"
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );

    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

} x264_mb_analysis_list_t;
    /* conduct the analysis using this lambda and QP */
    uint16_t *p_cost_ref[2];

    /* Take some shortcuts in intra search if intra is deemed unlikely */

    int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */

    int i_satd_i16x16_dir[7];

    int i_satd_i8x8_dir[12][4];

    int i_predict4x4[16];

    int i_satd_i8x8chroma;
    int i_satd_i8x8chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;

    int i_cost8x8direct[4];

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */

    int b_direct_available;

} x264_mb_analysis_t;
/* lambda = pow(2,qp/6-2) */
const uint8_t x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0- 7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
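
/* Worked example of the formula above: qp=26 gives pow(2,26/6.-2) =
 * pow(2,2.33) ~= 5.0 -> 5, and qp=51 gives pow(2,6.5) ~= 90.5 -> 91,
 * matching the corresponding entries in the table. */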
/* lambda2 = pow(lambda,2) * .9 * 256 */
const int x264_lambda2_tab[52] = {
       14,      18,      22,      28,      36,      45,      57,      72, /*  0 -  7 */
       91,     115,     145,     182,     230,     290,     365,     460, /*  8 - 15 */
      580,     731,     921,    1161,    1462,    1843,    2322,    2925, /* 16 - 23 */
     3686,    4644,    5851,    7372,    9289,   11703,   14745,   18578, /* 24 - 31 */
    23407,   29491,   37156,   46814,   58982,   74313,   93628,  117964, /* 32 - 39 */
   148626,  187257,  235929,  297252,  374514,  471859,  594505,  749029, /* 40 - 47 */
   943718, 1189010, 1498059, 1887436                                      /* 48 - 51 */
};
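
/* Worked example: qp=0 gives lambda=0.25, and 0.25*0.25*.9*256 = 14.4 -> 14.
 * Since lambda doubles every 6 QP, lambda2 doubles every 3 QP (14 -> 28 at qp=3). */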
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
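
/* x264_exp2_lut[i] = round( 256*(2^(i/64) - 1) ), e.g. lut[32] =
 * round(256*(sqrt(2)-1)) = 106: the fractional part of a fixed-point exp2,
 * with the integer part of the exponent presumably applied as a shift by
 * the caller. */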
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
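
/* x264_log2_lut[i] = log2( 1 + i/128 ), e.g. lut[64] = log2(1.5) = 0.58496:
 * the mantissa contribution of a table-driven log2. */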
/* Avoid an int/float conversion. */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
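
/* Indexed by the leading-zero count of a 32-bit value: lut[lz] == 31-lz, the
 * integer part of its log2, stored as float so it can be added to the mantissa
 * term above without an int->float conversion. */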
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      46,      58,      73,      92,     117,     147,
          185,     233,     294,     370,     466,     587,
          740,     932,    1174,    1480,    1864,    2349,
         2959,    3728,    4697,    5918,    7457,    9395,
        11837,   14914,   18790,   23674,   29828,   37581,
        47349,   59656,   75163,   94699,  119313,  150326,
       189399,  238627,  300652,  378798,  477255,  601304,
       757596,  954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      27,      34,      43,      54,      68,      86,
          108,     136,     172,     216,     273,     343,
          433,     545,     687,     865,    1090,    1374,
         1731,    2180,    2747,    3461,    4361,    5494,
         6922,    8721,   10988,   13844,   17442,   21976,
        27688,   34885,   43953,   55377,   69771,   87906,
       110755,  139543,  175813,  221511,  279087,  351627,
       443023,  558174,  703255,  886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};
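
/* Sanity check on the formulas above: at qp=0 the first inter entry is
 * .85*.85 * 2^(10-LAMBDA_BITS) = 46.24 -> 46, which only works out if
 * LAMBDA_BITS == 4; the intra entry matches too: .65*.65*64 = 27.04 -> 27. */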
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
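
/* The table is 2^(i/3+4): entry 12 (= 256) is the neutral scale applied when
 * luma and chroma QP are equal, since it is indexed by i_qp-i_chroma_qp+12 in
 * x264_mb_analyse_init_qp() below; each QP of offset scales lambda2 by 2^(1/3). */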
/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] = {
    5, 3, 3, 1
};

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static uint16_t x264_cost_ref[92][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
int x264_analyse_init_costs( x264_t *h, int qp )
{
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[lambda] )
        return 0;
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv[lambda] += 2*4*2048;
    for( int i = 0; i <= 2*4*2048; i++ )
    {
        h->cost_mv[lambda][-i] =
        h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
    }
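    /* This approximates the bit cost of the Exp-Golomb code for a qpel mv
     * residual: roughly 2*log2(mv+1) bits, plus one sign bit for any nonzero
     * residual (the !!i term); 0.718 is presumably a fitted constant and the
     * final +.5f rounds to nearest. */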
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( int i = 0; i < 3; i++ )
        for( int j = 0; j < 33; j++ )
            x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
    x264_pthread_mutex_unlock( &cost_ref_mutex );

    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
    {
        for( int j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[lambda][j] += 2*2048;
            for( int i = -2*2048; i < 2*2048; i++ )
                h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
        }
    }
    return 0;
fail:
    return -1;
}
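
/* cost_mv_fpel above holds four sub-tables, one per quarter-pel phase:
 * cost_mv_fpel[lambda][j][i] is the cost of the qpel mv 4*i+j, letting the
 * fullpel ESA/TESA search look up mv costs by fullpel position alone. */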
void x264_analyse_free_costs( x264_t *h )
{
    for( int i = 0; i < 92; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        if( h->cost_mv_fpel[i][0] )
            for( int j = 0; j < 4; j++ )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}
void x264_analyse_weight_frame( x264_t *h, int end )
{
    for( int j = 0; j < h->i_ref0; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref0[j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << h->param.b_interlaced;
            int offset, height;
            uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
            height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;

            for( int k = j; k < h->i_ref0; k++ )
                if( h->sh.weight[k][0].weightfn )
                {
                    uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                    x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                             src + offset, frame->i_stride[0],
                                             width, height, &h->sh.weight[k][0] );
                }
            break;
        }
    }
}
/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    a->p_cost_mv = h->cost_mv[a->i_lambda];
    a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}
static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    /* conduct the analysis using this lambda and QP */
    a->i_qp = h->mb.i_qp = i_qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];

    a->i_lambda = x264_lambda_tab[i_qp];
    a->i_lambda2 = x264_lambda2_tab[i_qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
}
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
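    /* i.e. subme 6-7 -> mbrd 1, subme 8-9 -> mbrd 2, subme 10 -> mbrd 3;
     * the B-slice decrement above applies to the first two thresholds but
     * not to the QPRD one, which tests i_subpel_refine directly. */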

    x264_mb_analyse_init_qp( h, a, i_qp );

    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;

    /* I: Intra part */
    a->i_satd_i8x8chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;

    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
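        /* e.g. for the leftmost macroblock this allows motion up to 24 luma
         * pixels (96 qpel units) past the left picture edge, before CLIP_FMV
         * restricts the range to the configured i_mv_range. */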
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        if( h->mb.i_mb_x == 0 )
        {
            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                {
                    x264_frame_t **fref = i ? h->fref1 : h->fref0;
                    int i_ref = i ? h->i_ref1 : h->i_ref0;
                    for( int j = 0; j < i_ref; j++ )
                    {
                        x264_frame_cond_wait( fref[j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
                    }
                }
                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( h->mb.b_interlaced )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
            h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
            h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
            h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
            h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
            h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
        }
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->i_cost16x16direct  =
            a->i_cost8x16bi       = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
                a->l0.i_cost4x8[i] = COST_MAX;

        /* Fast intra decision */
        if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Always run in fast-intra mode for subme < 3 */
            if( h->mb.i_subpel_refine > 2 &&
                ( IS_INTRA( h->mb.i_mb_type_left ) ||
                  IS_INTRA( h->mb.i_mb_type_top ) ||
                  IS_INTRA( h->mb.i_mb_type_topleft ) ||
                  IS_INTRA( h->mb.i_mb_type_topright ) ||
                  (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] )) ||
                  (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
            { /* intra is likely */ }
            else
                a->b_fast_intra = 1;
        }

        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
        }
        else
            a->b_force_intra = 0;
    }
}

/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t i8x8chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i4x4_mode_available[5][10] =
{
    {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
};
static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
}

static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
}

static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
}
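
/* The row index packs the neighbour flags: assuming MB_LEFT and MB_TOP are
 * bits 0 and 1, rows 0-3 are none/left/top/top+left, and any macroblock that
 * also has a usable topleft neighbour takes the full row 4. */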

/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
    ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};

    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}

/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;
    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}
static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;

    if( a->i_satd_i8x8chroma < COST_MAX )
        return;

    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );

    /* 8x8 prediction selection for chroma */
    if( predict_mode[3] >= 0 && b_merged_satd )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_8x8_chroma( h, i_mode );
            else
            {
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
                     h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );

            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    uint8_t *p_src = h->mb.pic.p_fenc[0];
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    int idx;
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;

    /*---------------- Try all modes and calculate their scores ---------------*/

    /* 16x16 prediction selection */
    const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

    if( b_merged_satd && predict_mode[3] >= 0 )
    {
        h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
        h->predict_16x16[I_PRED_16x16_P]( p_dst );
        a->i_satd_i16x16_dir[I_PRED_16x16_P] =
            h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
        for( int i = 0; i < 4; i++ )
        {
            int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
            COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            if( h->mb.b_lossless )
                x264_predict_lossless_16x16( h, i_mode );
            else
                h->predict_16x16[i_mode]( p_dst );

            i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
            COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
            a->i_satd_i16x16_dir[i_mode] = i_satd;
        }
    }

    if( h->sh.i_type == SLICE_TYPE_B )
        /* cavlc mb type prefix */
        a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];

    /* Not heavily tuned */
    const uint8_t i16x16_thresh[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
    if( a->b_fast_intra && a->i_satd_i16x16 > (i16x16_thresh[h->mb.i_subpel_refine]*i_satd_inter)>>1 )
        return;

    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        // FIXME some bias like in i4x4?
        int i_cost = a->i_lambda * 4; /* base predmode costs */
        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( b_merged_satd && predict_mode[8] >= 0 )
            {
                int satd[9];
                h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( int i = 2; i >= 0; i-- )
                {
                    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i];
                    COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                }
                predict_mode += 3;
            }

            for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
            {
                int i_satd;
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                else
                    h->predict_8x8[i_mode]( p_dst_by, edge );

                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                    i_satd -= 3 * a->i_lambda;

                COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * a->i_lambda;
            }
            i_cost += i_best + 3 * a->i_lambda;

            if( idx == 3 || i_cost > i_satd_thresh )
                break;

            /* we need to encode this block now (for next ones) */
            h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
            x264_mb_encode_i8x8( h, idx, a->i_qp );

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
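            /* cost_div_fix8[idx] is 4/(idx+1) in Q8 (1024, 512, ~341): having
             * bailed out after idx+1 of the four 8x8 blocks, scale the partial
             * cost up to a whole-macroblock estimate for the threshold below. */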
        }
        /* Not heavily tuned */
        const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
        if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost = a->i_lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
        int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;

        i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
        {
            uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
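            /* Multiplying the byte by 0x01010101 broadcasts it across the word:
             * the four topright samples become copies of the rightmost valid
             * top sample, the substitution H.264 prescribes when topright
             * samples are unavailable. */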

            if( b_merged_satd && predict_mode[5] >= 0 )
            {
                int satd[9];
                h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( int i = 2; i >= 0; i-- )
                    COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
                predict_mode += 3;
            }

            if( i_best > 0 )
            {
                for( ; *predict_mode >= 0; predict_mode++ )
                {
                    int i_satd;
                    int i_mode = *predict_mode;

                    if( h->mb.b_lossless )
                        x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                    else
                        h->predict_4x4[i_mode]( p_dst_by );

                    i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                    {
                        i_satd -= a->i_lambda * 3;
                        if( i_satd <= 0 )
                        {
                            i_best = i_satd;
                            a->i_predict4x4[idx] = i_mode;
                            break;
                        }
                    }

                    COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
                }
            }
            i_cost += i_best + 3 * a->i_lambda;

            if( i_cost > i_satd_thresh || idx == 15 )
                break;

            /* we need to encode this block now (for next ones) */
            h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
            x264_mb_encode_i4x4( h, idx, a->i_qp );

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
        if( idx == 15 )
        {
            a->i_satd_i4x4 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
            }
        }
        else
            a->i_satd_i4x4 = COST_MAX;
    }
}

static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( a->i_satd_i16x16 <= i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
    {
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
    {
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    uint64_t i_satd, i_best;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
    if( predict_mode[1] >= 0 )
    {
        int8_t predict_mode_sorted[4];
        int i_max;
        int i_thresh = a->i_satd_i8x8chroma * 5/4;

        for( i_max = 0; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                predict_mode_sorted[i_max++] = i_mode;
        }

        if( i_max > 0 )
        {
            int i_cbp_chroma_best = h->mb.i_cbp_chroma;
            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
            /* the previous thing encoded was x264_intra_rd(), so the pixels and
             * coefs for the current chroma mode are still around, so we only
             * have to recount the bits. */
            i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
            for( int i = 0; i < i_max; i++ )
            {
                int i_mode = predict_mode_sorted[i];
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8_chroma( h, i_mode );
                else
                {
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
                }
                /* if we've already found a mode that needs no residual, then
                 * probably any mode with a residual will be worse.
                 * so avoid dct on the remaining modes to improve speed. */
                i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
            }
            h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
            h->mb.i_cbp_chroma = i_cbp_chroma_best;
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
        int i_nnz = 0;
        for( int idx = 0; idx < 16; idx++ )
        {
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            i_best = COST_MAX64;

            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                else
                    h->predict_4x4[i_mode]( p_dst_by );
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
                    pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
                    pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
                    pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                }
            }

            M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
            M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
            M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
            M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        for( int idx = 0; idx < 4; idx++ )
        {
            uint64_t pels_h = 0;
            uint8_t pels_v[7];
            uint16_t i_nnz[2] = {0}; //shut up gcc
            uint8_t *p_dst_by;
            int cbp_luma_new = 0;
            int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;

            i_best = COST_MAX64;

            int x = idx&1;
            int y = idx>>1;
            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
                    continue;

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                else
                    h->predict_8x8[i_mode]( p_dst_by, edge );
                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    pels_h = M64( p_dst_by+7*FDEC_STRIDE );
                    if( !(idx&1) )
                        for( int j = 0; j < 7; j++ )
                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
            if( !(idx&1) )
                for( int j = 0; j < 7; j++ )
                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}

#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \

#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];

#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])

static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_mvc;
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frames */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        m.i_ref_cost = REF_COST( 0, i_ref );
        i_halfpel_thresh -= m.i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );

        if( h->mb.ref_blind_dupe == i_ref )
        {
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
        }
        else
        {
            x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        }

        /* save mv for predicting neighbors */
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
        CP32( a->l0.mvc[i_ref][0], m.mv );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0
            && a->b_try_pskip
            && m.cost-m.cost_mv < 300*a->i_lambda
            && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
             + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
        {
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            return;
        }

        m.cost += m.i_ref_cost;
        i_halfpel_thresh += m.i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
    if( a->i_mbrd )
    {
        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
                h->mb.i_type = P_SKIP;
        }
    }
}

static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
}

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
    {
        i_maxref = 0;
        CHECK_NEIGHBOUR(  -8 - 1 );
        CHECK_NEIGHBOUR(  -8 + 0 );
        CHECK_NEIGHBOUR(  -8 + 2 );
        CHECK_NEIGHBOUR(  -8 + 4 );
        CHECK_NEIGHBOUR(   0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
#undef CHECK_NEIGHBOUR

    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        const int x8 = i%2;
        const int y8 = i/2;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        const int x8 = i%2;
        const int y8 = i/2;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }
        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }
        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}

static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
{
    ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
    uint8_t *pix2 = pix1+8;
    const int i_stride = h->mb.pic.i_stride[1];
    const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
    const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

#define CHROMA4x4MC( width, height, me, x, y ) \
    h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
    if( weight[1].weightfn ) \
        weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
    h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
    if( weight[2].weightfn ) \
        weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );

    if( pixel == PIXEL_4x4 )
    {
        x264_me_t *m = a->l0.me4x4[i8x8];
        CHROMA4x4MC( 2,2, m[0], 0,0 );
        CHROMA4x4MC( 2,2, m[1], 2,0 );
        CHROMA4x4MC( 2,2, m[2], 0,2 );
        CHROMA4x4MC( 2,2, m[3], 2,2 );
    }
    else if( pixel == PIXEL_8x4 )
    {
        x264_me_t *m = a->l0.me8x4[i8x8];
        CHROMA4x4MC( 4,2, m[0], 0,0 );
        CHROMA4x4MC( 4,2, m[1], 0,2 );
    }
    else
    {
        x264_me_t *m = a->l0.me4x8[i8x8];
        CHROMA4x4MC( 2,4, m[0], 0,0 );
        CHROMA4x4MC( 2,4, m[1], 2,0 );
    }

    return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}

static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}

static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}

static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}

static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    uint8_t *p_fenc = h->mb.pic.p_fenc[0];
    uint8_t *p_fdec = h->mb.pic.p_fdec[0];

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
        for( int i = 0; i < 4; i++ )
        {
            const int x = (i&1)*8;
            const int y = (i>>1)*8;
            a->i_cost16x16direct +=
            a->i_cost8x8direct[i] =
                h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );

            /* mb type cost */
            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
        }
    else
        a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
}

static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
    ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
    uint8_t *src0, *src1;
    int stride0 = 16, stride1 = 16;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[9][2] );
    int try_skip = a->b_try_skip;
    int list1_skipped = 0;
    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
    int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
                                h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};

    x264_me_t m;
    m.i_pixel = PIXEL_16x16;

    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* 16x16 Search on list 0 and list 1 */
    a->l0.me16x16.cost = INT_MAX;
    a->l1.me16x16.cost = INT_MAX;
    for( int l = 1; l >= 0; )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

        /* This loop is extremely munged in order to facilitate the following order of operations,
         * necessary for an efficient fast skip.
         * 1.  Search list1 ref0.
         * 2.  Search list0 ref0.
         * 3.  Try skip.
         * 4.  Search the rest of list0.
         * 5.  Go back and finish list1. */
        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
        {
            if( try_skip && l == 1 && i_ref > 0 )
            {
                list1_skipped = 1;
                break;
            }

            m.i_ref_cost = REF_COST( l, i_ref );

            /* search with ref */
            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );

            /* add ref cost */
            m.cost += m.i_ref_cost;

            if( m.cost < lX->me16x16.cost )
                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );

            /* save mv for predicting neighbors */
            CP32( lX->mvc[i_ref][0], m.mv );
            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );

            /* Fast skip detection. */
            if( i_ref == 0 && try_skip )
            {
                if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
                    abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
                {
                    try_skip = 0;
                }
                else if( !l )
                {
                    /* We already tested skip */
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, a );
                    return;
                }
            }
        }
        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
            break;
        if( list1_skipped && l == 0 )
            l = 1;
        else
            l--;
    }

    /* get cost of BI mode */
    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + ref_costs
                     + a->l0.bi16x16.cost_mv
                     + a->l1.bi16x16.cost_mv;

    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
    {
        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                   + ref_costs + l0_mv_cost + l1_mv_cost;
        if( cost00 < a->i_cost16x16bi )
        {
            M32( a->l0.bi16x16.mv ) = 0;
            M32( a->l1.bi16x16.mv ) = 0;
            a->l0.bi16x16.cost_mv = l0_mv_cost;
            a->l1.bi16x16.cost_mv = l1_mv_cost;
            a->i_cost16x16bi = cost00;
        }
    }

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}
1753 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1755 const int x = 2*(i%2);
1756 const int y = 2*(i/2);
1758 switch( h->mb.i_sub_partition[i] )
1760 case D_L0_8x8:
1761 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1762 break;
1763 case D_L0_8x4:
1764 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1765 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1766 break;
1767 case D_L0_4x8:
1768 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1769 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1770 break;
1771 case D_L0_4x4:
1772 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1773 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1774 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1775 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1776 break;
1777 default:
1778 x264_log( h, X264_LOG_ERROR, "internal error\n" );
1783 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1785 const int x = 2*(idx&1);
1786 const int y = 2*(idx>>1);
1787 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1788 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1789 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1790 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
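/* Write one B partition's refs and MVs into the cache for both lists: a list
 * the partition type actually uses (per x264_mb_partition_listX_table) gets
 * its ME result; an unused list is marked ref -1 with a zero MV, plus a zero
 * MV differential when the caller's b_mvd flag is set. */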
1793 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1794 if( x264_mb_partition_listX_table[0][part] ) \
1795 { \
1796 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
1797 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1798 } \
1799 else \
1800 { \
1801 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1802 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1803 if( b_mvd ) \
1804 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1805 } \
1806 if( x264_mb_partition_listX_table[1][part] ) \
1807 { \
1808 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
1809 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1810 } \
1811 else \
1812 { \
1813 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1814 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1815 if( b_mvd ) \
1816 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1817 }
1819 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1823 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1825 x264_mb_load_mv_direct8x8( h, i );
1828 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1829 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1830 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1835 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1838 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1840 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1842 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1844 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1848 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1850 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1851 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
1853 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1854 * than those used by the neighbors */
1855 #define CHECK_NEIGHBOUR(i)\
1856 {\
1857 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
1858 if( ref > i_maxref[l] )\
1859 i_maxref[l] = ref;\
1860 }
1862 for( int l = 0; l < 2; l++ )
1864 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1865 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
1866 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1869 CHECK_NEIGHBOUR( -8 - 1 );
1870 CHECK_NEIGHBOUR( -8 + 0 );
1871 CHECK_NEIGHBOUR( -8 + 2 );
1872 CHECK_NEIGHBOUR( -8 + 4 );
1873 CHECK_NEIGHBOUR( 0 - 1 );
1874 CHECK_NEIGHBOUR( 2*8 - 1 );
1878 /* XXX Needed for x264_mb_predict_mv */
1879 h->mb.i_partition = D_8x8;
1881 a->i_cost8x8bi = 0;
1883 for( int i = 0; i < 4; i++ )
1884 {
1885 int x8 = i%2;
1886 int y8 = i/2;
1887 int i_part_cost;
1888 int i_part_cost_bi;
1889 int stride[2] = {8,8};
1890 uint8_t *src[2];
1891 x264_me_t m;
1892 m.i_pixel = PIXEL_8x8;
1893 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1895 for( int l = 0; l < 2; l++ )
1897 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1899 lX->me8x8[i].cost = INT_MAX;
1900 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
1902 m.i_ref_cost = REF_COST( l, i_ref );
1904 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
1906 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
1907 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
1908 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
1909 m.cost += m.i_ref_cost;
1911 if( m.cost < lX->me8x8[i].cost )
1912 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
1914 /* save mv for predicting other partitions within this MB */
1915 CP32( lX->mvc[i_ref][i+1], m.mv );
1920 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
1921 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
1922 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
1923 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
1924 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
1925 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
1927 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1928 + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv + a->l0.me8x8[i].i_ref_cost
1929 + a->l1.me8x8[i].i_ref_cost + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1931 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1932 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1934 i_part_cost = a->l0.me8x8[i].cost;
1935 h->mb.i_sub_partition[i] = D_L0_8x8;
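/* COPY2_IF_LT keeps a running best; roughly
 *   #define COPY2_IF_LT(x,y,a,b) if( (y) < (x) ) { (x) = (y); (a) = (b); }
 * so the cheapest of L0/L1/BI/direct wins i_part_cost and i_sub_partition. */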
1936 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1937 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1938 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1939 a->i_cost8x8bi += i_part_cost;
1941 /* XXX Needed for x264_mb_predict_mv */
1942 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1946 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
1949 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1951 uint8_t **p_fref[2] =
1952 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
1953 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
1954 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1956 /* XXX Needed for x264_mb_predict_mv */
1957 h->mb.i_partition = D_8x8;
1959 a->i_cost8x8bi = 0;
1961 for( int i = 0; i < 4; i++ )
1962 {
1963 int x8 = i%2;
1964 int y8 = i/2;
1965 int i_part_cost;
1966 int i_part_cost_bi = 0;
1967 int stride[2] = {8,8};
1968 uint8_t *src[2];
1970 for( int l = 0; l < 2; l++ )
1972 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1973 x264_me_t *m = &lX->me8x8[i];
1974 m->i_pixel = PIXEL_8x8;
1975 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1977 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
1978 m->i_ref = lX->me16x16.i_ref;
1980 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
1982 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
1983 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1984 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1985 m->cost += m->i_ref_cost;
1987 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1989 /* save mv for predicting other partitions within this MB */
1990 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
1993 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1994 m->mv[0], m->mv[1], 8, 8, weight_none );
1995 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
1997 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
1998 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1999 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2000 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2001 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2003 i_part_cost = a->l0.me8x8[i].cost;
2004 h->mb.i_sub_partition[i] = D_L0_8x8;
2005 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2006 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2007 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2008 a->i_cost8x8bi += i_part_cost;
2010 /* XXX Needed for x264_mb_predict_mv */
2011 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2015 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2018 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
2020 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
2021 ALIGNED_4( int16_t mvc[3][2] );
2023 h->mb.i_partition = D_16x8;
2024 a->i_cost16x8bi = 0;
2026 for( int i = 0; i < 2; i++ )
2027 {
2028 int i_part_cost;
2029 int i_part_cost_bi = 0;
2030 int stride[2] = {16,16};
2031 uint8_t *src[2];
2032 x264_me_t m;
2033 m.i_pixel = PIXEL_16x8;
2034 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2036 for( int l = 0; l < 2; l++ )
2038 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2039 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2040 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2041 lX->me16x8[i].cost = INT_MAX;
2042 for( int j = 0; j < i_ref8s; j++ )
2044 int i_ref = ref8[j];
2045 m.i_ref_cost = REF_COST( l, i_ref );
2047 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2049 CP32( mvc[0], lX->mvc[i_ref][0] );
2050 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2051 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2053 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2054 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2055 x264_me_search( h, &m, mvc, 3 );
2056 m.cost += m.i_ref_cost;
2058 if( m.cost < lX->me16x8[i].cost )
2059 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2064 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2065 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2066 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2067 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2068 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2069 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2071 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2072 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2073 + a->l1.me16x8[i].i_ref_cost;
2075 i_part_cost = a->l0.me16x8[i].cost;
2076 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2078 if( a->l1.me16x8[i].cost < i_part_cost )
2080 i_part_cost = a->l1.me16x8[i].cost;
2081 a->i_mb_partition16x8[i] = D_L1_8x8;
2083 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2085 i_part_cost = i_part_cost_bi;
2086 a->i_mb_partition16x8[i] = D_BI_8x8;
2088 a->i_cost16x8bi += i_part_cost;
2090 x264_mb_cache_mv_b16x8( h, a, i, 0 );
2094 a->i_mb_type16x8 = B_L0_L0
2095 + (a->i_mb_partition16x8[0]>>2) * 3
2096 + (a->i_mb_partition16x8[1]>>2);
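/* mb_partition_e is laid out so that >>2 maps D_L0_8x8/D_L1_8x8/D_BI_8x8 to
 * 0/1/2, and the nine 16x8 classes follow B_L0_L0 in row-major order, so e.g.
 * an L0 top half with a BI bottom half yields B_L0_L0 + 0*3 + 2 = B_L0_BI. */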
2097 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2100 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
2102 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2103 ALIGNED_4( int16_t mvc[3][2] );
2105 h->mb.i_partition = D_8x16;
2106 a->i_cost8x16bi = 0;
2108 for( int i = 0; i < 2; i++ )
2109 {
2110 int i_part_cost;
2111 int i_part_cost_bi = 0;
2112 int stride[2] = {8,8};
2113 uint8_t *src[2];
2114 x264_me_t m;
2115 m.i_pixel = PIXEL_8x16;
2116 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2118 for( int l = 0; l < 2; l++ )
2120 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2121 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2122 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2123 lX->me8x16[i].cost = INT_MAX;
2124 for( int j = 0; j < i_ref8s; j++ )
2126 int i_ref = ref8[j];
2127 m.i_ref_cost = REF_COST( l, i_ref );
2129 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2131 CP32( mvc[0], lX->mvc[i_ref][0] );
2132 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2133 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2135 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2136 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2137 x264_me_search( h, &m, mvc, 3 );
2138 m.cost += m.i_ref_cost;
2140 if( m.cost < lX->me8x16[i].cost )
2141 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2146 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2147 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2148 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2149 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2150 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2152 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2153 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2154 + a->l1.me8x16[i].i_ref_cost;
2156 i_part_cost = a->l0.me8x16[i].cost;
2157 a->i_mb_partition8x16[i] = D_L0_8x8;
2159 if( a->l1.me8x16[i].cost < i_part_cost )
2161 i_part_cost = a->l1.me8x16[i].cost;
2162 a->i_mb_partition8x16[i] = D_L1_8x8;
2164 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2166 i_part_cost = i_part_cost_bi;
2167 a->i_mb_partition8x16[i] = D_BI_8x8;
2169 a->i_cost8x16bi += i_part_cost;
2171 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2175 a->i_mb_type8x16 = B_L0_L0
2176 + (a->i_mb_partition8x16[0]>>2) * 3
2177 + (a->i_mb_partition8x16[1]>>2);
2178 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
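/* Re-score the cheapest SATD modes with full RD: candidates within roughly
 * 5/4 of the best inter SATD (3/2 for the 16x16 check) are deemed worth an
 * RD call; the rest are set to COST_MAX so later comparisons ignore them. */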
2181 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2183 int thresh = i_satd * 5/4;
2185 h->mb.i_type = P_L0;
2186 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2188 h->mb.i_partition = D_16x16;
2189 x264_analyse_update_cache( h, a );
2190 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2193 if( a->l0.i_cost16x8 <= thresh )
2195 h->mb.i_partition = D_16x8;
2196 x264_analyse_update_cache( h, a );
2197 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2199 else
2200 a->l0.i_cost16x8 = COST_MAX;
2202 if( a->l0.i_cost8x16 <= thresh )
2204 h->mb.i_partition = D_8x16;
2205 x264_analyse_update_cache( h, a );
2206 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2208 else
2209 a->l0.i_cost8x16 = COST_MAX;
2211 if( a->l0.i_cost8x8 <= thresh )
2213 h->mb.i_type = P_8x8;
2214 h->mb.i_partition = D_8x8;
2215 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2217 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2218 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2219 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2220 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2221 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2222 * for future blocks are those left over from previous RDO calls. */
2223 for( int i = 0; i < 4; i++ )
2225 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2226 int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2227 int subtype, btype = D_L0_8x8;
2228 uint64_t bcost = COST_MAX64;
2229 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2230 {
2231 uint64_t cost;
2232 if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2233 continue;
2234 h->mb.i_sub_partition[i] = subtype;
2235 x264_mb_cache_mv_p8x8( h, a, i );
2236 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2237 COPY2_IF_LT( bcost, cost, btype, subtype );
2239 if( h->mb.i_sub_partition[i] != btype )
2241 h->mb.i_sub_partition[i] = btype;
2242 x264_mb_cache_mv_p8x8( h, a, i );
2247 x264_analyse_update_cache( h, a );
2248 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2250 else
2251 a->l0.i_cost8x8 = COST_MAX;
2254 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2256 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
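/* Only modes within 17/16 of the best inter SATD get an RD pass; the bound
 * widens to 18/16 with psy-RD, where SATD predicts RD cost less reliably. */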
2258 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2260 h->mb.i_type = B_DIRECT;
2261 /* Assumes direct/skip MC is still in fdec */
2262 /* Requires b-rdo to be done before intra analysis */
2263 h->mb.b_skip_mc = 1;
2264 x264_analyse_update_cache( h, a );
2265 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2266 h->mb.b_skip_mc = 0;
2269 //FIXME not all the update_cache calls are needed
2270 h->mb.i_partition = D_16x16;
2272 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2274 h->mb.i_type = B_L0_L0;
2275 x264_analyse_update_cache( h, a );
2276 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2280 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2282 h->mb.i_type = B_L1_L1;
2283 x264_analyse_update_cache( h, a );
2284 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2288 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2290 h->mb.i_type = B_BI_BI;
2291 x264_analyse_update_cache( h, a );
2292 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2296 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2298 h->mb.i_type = B_8x8;
2299 h->mb.i_partition = D_8x8;
2300 x264_analyse_update_cache( h, a );
2301 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2302 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2306 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2308 h->mb.i_type = a->i_mb_type16x8;
2309 h->mb.i_partition = D_16x8;
2310 x264_analyse_update_cache( h, a );
2311 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2315 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2317 h->mb.i_type = a->i_mb_type8x16;
2318 h->mb.i_partition = D_8x16;
2319 x264_analyse_update_cache( h, a );
2320 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2324 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2326 int i_biweight;
2328 if( IS_INTRA(h->mb.i_type) )
2329 return;
2331 switch( h->mb.i_partition )
2333 case D_16x16:
2334 if( h->mb.i_type == B_BI_BI )
2336 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2337 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2340 case D_16x8:
2341 for( int i = 0; i < 2; i++ )
2342 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2344 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2345 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2348 case D_8x16:
2349 for( int i = 0; i < 2; i++ )
2350 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2352 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2353 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2356 case D_8x8:
2357 for( int i = 0; i < 4; i++ )
2358 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2360 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2361 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2367 static inline void x264_mb_analyse_transform( x264_t *h )
2369 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2371 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2372 x264_mb_mc( h );
2374 int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2375 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2376 int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2377 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2379 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
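/* sa8d measures the residual under an 8x8 Hadamard and satd under a 4x4 one,
 * so comparing them cheaply approximates which transform size will code this
 * macroblock's inter residual in fewer bits. */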
2380 h->mb.b_skip_mc = 1;
2384 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2386 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2388 x264_analyse_update_cache( h, a );
2389 h->mb.b_transform_8x8 ^= 1;
2390 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2391 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2393 if( *i_rd >= i_rd8 )
2394 {
2395 if( *i_rd > 0 )
2396 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2397 *i_rd = i_rd8;
2398 }
2399 else
2400 h->mb.b_transform_8x8 ^= 1;
2404 /* Rate-distortion optimal QP selection.
2405 * FIXME: More than half of the benefit of this function seems to be
2406 * in the way it improves the coding of chroma DC (by decimating or
2407 * finding a better way to code a single DC coefficient.)
2408 * There must be a more efficient way to get that portion of the benefit
2409 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2410 * trick. */
2411 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2413 int bcost, cost, failures, prevcost, origcost;
2414 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2415 int last_qp_tried = 0;
2416 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2417 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2419 /* If CBP is already zero, don't raise the quantizer any higher. */
2420 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2422 /* Without psy-RD, require monotonicity when moving quant away from previous
2423 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2424 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2425 * allow 2 failures when moving quant towards previous quant.
2426 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2427 int threshold = (!!h->mb.i_psy_rd);
2428 /* Raise the threshold for failures if we're moving towards the last QP. */
2429 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2430 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2431 threshold++;
2432 h->mb.i_qp = orig_qp;
2433 failures = 0;
2434 prevcost = origcost;
2436 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2437 * (up to a point) will too. So, jump down to where the threshold will kick in
2438 * and check the QP there. If the CBP is still empty, skip the main loop.
2439 * If it isn't empty, we would have ended up having to check this QP anyways,
2440 * so as long as we store it for later lookup, we lose nothing. */
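/* E.g. with psy-RD off and the walk headed toward the previous QP, threshold
 * is 1 and the probe lands 2 QPs down; if the CBP is still zero there, the
 * whole descending walk is almost certainly fruitless and gets skipped. */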
2441 int already_checked_qp = -1;
2442 int already_checked_cost = COST_MAX;
2443 if( direction == -1 )
2444 {
2445 if( !origcbp )
2446 {
2447 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2448 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2449 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2450 if( !h->mb.cbp[h->mb.i_mb_xy] )
2451 {
2452 /* If our empty-CBP block is lower QP than the last QP,
2453 * the last QP almost surely doesn't have a CBP either. */
2454 if( h->mb.i_last_qp > h->mb.i_qp )
2455 last_qp_tried = 1;
2456 break;
2457 }
2458 already_checked_qp = h->mb.i_qp;
2459 h->mb.i_qp = orig_qp;
2460 }
2461 }
2463 h->mb.i_qp += direction;
2464 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2466 if( h->mb.i_last_qp == h->mb.i_qp )
2467 last_qp_tried = 1;
2468 if( h->mb.i_qp == already_checked_qp )
2469 cost = already_checked_cost;
2470 else
2471 {
2472 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2473 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2474 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2475 }
2477 /* We can't assume that the costs are monotonic over QPs.
2478 * Tie case-as-failure seems to give better results. */
2479 if( cost < prevcost )
2480 failures = 0;
2481 else
2482 failures++;
2483 prevcost = cost;
2485 if( failures > threshold )
2486 break;
2487 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2488 break;
2489 h->mb.i_qp += direction;
2490 }
2491 }
2493 /* Always try the last block's QP. */
2494 if( !last_qp_tried )
2496 h->mb.i_qp = h->mb.i_last_qp;
2497 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2498 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2499 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2502 h->mb.i_qp = bqp;
2503 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2505 /* Check transform again; decision from before may no longer be optimal. */
2506 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2507 x264_mb_transform_8x8_allowed( h ) )
2509 h->mb.b_transform_8x8 ^= 1;
2510 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2511 if( cost > bcost )
2512 h->mb.b_transform_8x8 ^= 1;
2516 /*****************************************************************************
2517 * x264_macroblock_analyse:
2518 *****************************************************************************/
2519 void x264_macroblock_analyse( x264_t *h )
2521 x264_mb_analysis_t analysis;
2522 int i_cost = COST_MAX;
2524 h->mb.i_qp = x264_ratecontrol_qp( h );
2525 if( h->param.rc.i_aq_mode )
2527 x264_adaptive_quant( h );
2528 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2529 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2530 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2531 h->mb.i_qp = h->mb.i_last_qp;
2534 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2536 /*--------------------------- Do the analysis ---------------------------*/
2537 if( h->sh.i_type == SLICE_TYPE_I )
2539 intra_analysis:
2540 if( analysis.i_mbrd )
2541 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2542 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2543 if( analysis.i_mbrd )
2544 x264_intra_rd( h, &analysis, COST_MAX );
2546 i_cost = analysis.i_satd_i16x16;
2547 h->mb.i_type = I_16x16;
2548 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2549 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2550 if( analysis.i_satd_pcm < i_cost )
2551 h->mb.i_type = I_PCM;
2553 else if( analysis.i_mbrd >= 2 )
2554 x264_intra_rd_refine( h, &analysis );
2556 else if( h->sh.i_type == SLICE_TYPE_P )
2558 int b_skip = 0;
2560 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2562 analysis.b_try_skip = 0;
2563 if( analysis.b_force_intra )
2565 if( !h->param.analyse.b_psy )
2567 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2568 goto intra_analysis;
2573 /* Fast P_SKIP detection */
2574 if( h->param.analyse.b_fast_pskip )
2576 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2577 // FIXME don't need to check this if the reference frame is done
2578 {}
2579 else if( h->param.analyse.i_subpel_refine >= 3 )
2580 analysis.b_try_skip = 1;
2581 else if( h->mb.i_mb_type_left == P_SKIP ||
2582 h->mb.i_mb_type_top == P_SKIP ||
2583 h->mb.i_mb_type_topleft == P_SKIP ||
2584 h->mb.i_mb_type_topright == P_SKIP )
2585 b_skip = x264_macroblock_probe_pskip( h );
2589 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2591 if( b_skip )
2592 {
2593 h->mb.i_type = P_SKIP;
2594 h->mb.i_partition = D_16x16;
2595 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2596 /* Set up MVs for future predictors */
2598 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2599 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2600 }
2601 else
2602 {
2603 const unsigned int flags = h->param.analyse.inter;
2604 int i_type;
2605 int i_partition;
2606 int i_thresh16x8;
2607 int i_satd_inter, i_satd_intra;
2609 x264_mb_analyse_load_costs( h, &analysis );
2611 x264_mb_analyse_inter_p16x16( h, &analysis );
2613 if( h->mb.i_type == P_SKIP )
2615 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2616 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2617 return;
2620 if( flags & X264_ANALYSE_PSUB16x16 )
2622 if( h->param.analyse.b_mixed_references )
2623 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2625 x264_mb_analyse_inter_p8x8( h, &analysis );
2628 /* Select best inter mode */
2629 i_type = P_L0;
2630 i_partition = D_16x16;
2631 i_cost = analysis.l0.me16x16.cost;
2633 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2634 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2636 i_type = P_8x8;
2637 i_partition = D_8x8;
2638 i_cost = analysis.l0.i_cost8x8;
2641 if( flags & X264_ANALYSE_PSUB8x8 )
2643 for( int i = 0; i < 4; i++ )
2645 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2646 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2648 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2649 h->mb.i_sub_partition[i] = D_L0_4x4;
2651 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2652 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2653 h->mb.i_sub_partition[i], D_L0_8x4 );
2655 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2656 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2657 h->mb.i_sub_partition[i], D_L0_4x8 );
2659 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2661 x264_mb_cache_mv_p8x8( h, &analysis, i );
2663 analysis.l0.i_cost8x8 = i_cost;
2667 /* Now do 16x8/8x16 */
2668 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2669 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2670 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2672 x264_mb_analyse_inter_p16x8( h, &analysis );
2673 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2675 x264_mb_analyse_inter_p8x16( h, &analysis );
2676 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2679 h->mb.i_partition = i_partition;
2682 //FIXME mb_type costs?
2683 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2684 {
2685 /* refine later */
2686 }
2687 else if( i_partition == D_16x16 )
2689 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2690 i_cost = analysis.l0.me16x16.cost;
2692 else if( i_partition == D_16x8 )
2694 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2695 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2696 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2698 else if( i_partition == D_8x16 )
2700 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2701 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2702 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2704 else if( i_partition == D_8x8 )
2707 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2709 switch( h->mb.i_sub_partition[i8x8] )
2711 case D_L0_8x8:
2712 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2713 i_cost += analysis.l0.me8x8[i8x8].cost;
2714 break;
2715 case D_L0_8x4:
2716 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2717 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2718 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2719 analysis.l0.me8x4[i8x8][1].cost;
2720 break;
2721 case D_L0_4x8:
2722 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2723 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2724 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2725 analysis.l0.me4x8[i8x8][1].cost;
2726 break;
2728 case D_L0_4x4:
2729 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2730 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2731 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2732 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2733 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2734 analysis.l0.me4x4[i8x8][1].cost +
2735 analysis.l0.me4x4[i8x8][2].cost +
2736 analysis.l0.me4x4[i8x8][3].cost;
2737 break;
2738 default:
2739 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2745 if( h->mb.b_chroma_me )
2747 x264_mb_analyse_intra_chroma( h, &analysis );
2748 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2749 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2750 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2751 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2754 x264_mb_analyse_intra( h, &analysis, i_cost );
2756 i_satd_inter = i_cost;
2757 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2758 analysis.i_satd_i8x8,
2759 analysis.i_satd_i4x4 );
2761 if( analysis.i_mbrd )
2763 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2764 i_type = P_L0;
2765 i_partition = D_16x16;
2766 i_cost = analysis.l0.i_rd16x16;
2767 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2768 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2769 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2770 h->mb.i_type = i_type;
2771 h->mb.i_partition = i_partition;
2772 if( i_cost < COST_MAX )
2773 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2774 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2777 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2778 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2779 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2780 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2782 h->mb.i_type = i_type;
2784 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2786 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2787 * it was an inter block. */
2788 x264_analyse_update_cache( h, &analysis );
2789 x264_macroblock_encode( h );
2790 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2791 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2792 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2793 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2794 goto intra_analysis;
2797 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2799 if( IS_INTRA( h->mb.i_type ) )
2801 x264_intra_rd_refine( h, &analysis );
2803 else if( i_partition == D_16x16 )
2805 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2806 analysis.l0.me16x16.cost = i_cost;
2807 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2809 else if( i_partition == D_16x8 )
2811 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2812 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2813 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2814 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2815 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2816 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2818 else if( i_partition == D_8x16 )
2820 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2821 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2822 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2823 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2824 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2825 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2827 else if( i_partition == D_8x8 )
2829 x264_analyse_update_cache( h, &analysis );
2830 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
2832 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2834 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2836 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2838 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2839 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2841 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2843 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2844 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2846 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2848 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2849 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2850 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2851 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2858 else if( h->sh.i_type == SLICE_TYPE_B )
2860 int i_bskip_cost = COST_MAX;
2861 int b_skip = 0;
2863 if( analysis.i_mbrd )
2864 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2866 h->mb.i_type = B_SKIP;
2867 if( h->mb.b_direct_auto_write )
2869 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2870 for( int i = 0; i < 2; i++ )
2872 int b_changed = 1;
2873 h->sh.b_direct_spatial_mv_pred ^= 1;
2874 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2875 if( analysis.b_direct_available )
2876 {
2877 if( b_changed )
2878 {
2879 x264_mb_mc( h );
2880 b_skip = x264_macroblock_probe_bskip( h );
2881 }
2882 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
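/* Each pass counts the MBs that could have been coded as skips under that
 * direct mode; the per-frame i_direct_score totals later steer direct=auto
 * toward whichever of spatial/temporal prediction skipped more. */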
2883 }
2884 else
2885 b_skip = 0;
2886 }
2887 }
2888 else
2889 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2891 analysis.b_try_skip = 0;
2892 if( analysis.b_direct_available )
2894 if( !h->mb.b_direct_auto_write )
2895 x264_mb_mc( h );
2896 if( analysis.i_mbrd )
2898 i_bskip_cost = ssd_mb( h );
2899 /* 6 = minimum cavlc cost of a non-skipped MB */
2900 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
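/* i_lambda2 is stored 256-scaled (see x264_lambda2_tab), so the +128 and >>8
 * round 6*lambda2 into the SSD domain: e.g. at QP 26, (6*5851 + 128) >> 8 =
 * 137 is the distortion a skip must not exceed to be taken outright. */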
2902 else if( !h->mb.b_direct_auto_write )
2904 /* Conditioning the probe on neighboring block types
2905 * doesn't seem to help speed or quality. */
2906 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
2907 if( h->param.analyse.i_subpel_refine < 3 )
2908 b_skip = analysis.b_try_skip;
2910 /* Set up MVs for future predictors */
2911 if( b_skip )
2912 {
2913 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2914 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2915 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
2916 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
2917 }
2918 else
2919 {
2922 const unsigned int flags = h->param.analyse.inter;
2923 int i_type;
2924 int i_partition;
2925 int i_satd_inter;
2926 h->mb.b_skip_mc = 0;
2927 h->mb.i_type = B_DIRECT;
2929 x264_mb_analyse_load_costs( h, &analysis );
2931 /* select best inter mode */
2932 /* direct must be first */
2933 if( analysis.b_direct_available )
2934 x264_mb_analyse_inter_direct( h, &analysis );
2936 x264_mb_analyse_inter_b16x16( h, &analysis );
2938 if( h->mb.i_type == B_SKIP )
2940 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2941 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2942 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
2943 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
2944 return;
2947 i_type = B_L0_L0;
2948 i_partition = D_16x16;
2949 i_cost = analysis.l0.me16x16.cost;
2950 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2951 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2952 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2954 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2956 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2957 if( i_bskip_cost < analysis.i_rd16x16direct &&
2958 i_bskip_cost < analysis.i_rd16x16bi &&
2959 i_bskip_cost < analysis.l0.i_rd16x16 &&
2960 i_bskip_cost < analysis.l1.i_rd16x16 )
2962 h->mb.i_type = B_SKIP;
2963 x264_analyse_update_cache( h, &analysis );
2964 return;
2968 if( flags & X264_ANALYSE_BSUB16x16 )
2970 if( h->param.analyse.b_mixed_references )
2971 x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
2973 x264_mb_analyse_inter_b8x8( h, &analysis );
2975 if( analysis.i_cost8x8bi < i_cost )
2977 i_type = B_8x8;
2978 i_partition = D_8x8;
2979 i_cost = analysis.i_cost8x8bi;
2981 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2982 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2984 x264_mb_analyse_inter_b16x8( h, &analysis );
2985 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2986 i_type, analysis.i_mb_type16x8,
2987 i_partition, D_16x8 );
2989 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2990 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2992 x264_mb_analyse_inter_b8x16( h, &analysis );
2993 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2994 i_type, analysis.i_mb_type8x16,
2995 i_partition, D_8x16 );
3000 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3001 {
3002 /* refine later */
3003 }
3005 else if( i_partition == D_16x16 )
3007 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3008 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3009 if( i_type == B_L0_L0 )
3011 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3012 i_cost = analysis.l0.me16x16.cost
3013 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3015 else if( i_type == B_L1_L1 )
3017 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3018 i_cost = analysis.l1.me16x16.cost
3019 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3021 else if( i_type == B_BI_BI )
3023 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3024 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3027 else if( i_partition == D_16x8 )
3029 for( int i = 0; i < 2; i++ )
3031 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3032 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3033 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3034 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3037 else if( i_partition == D_8x16 )
3039 for( int i = 0; i < 2; i++ )
3041 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3042 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3043 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3044 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3047 else if( i_partition == D_8x8 )
3049 for( int i = 0; i < 4; i++ )
3051 x264_me_t *m;
3052 int i_part_cost_old;
3053 int i_type_cost;
3054 int i_part_type = h->mb.i_sub_partition[i];
3055 int b_bidir = (i_part_type == D_BI_8x8);
3057 if( i_part_type == D_DIRECT_8x8 )
3058 continue;
3059 if( x264_mb_partition_listX_table[0][i_part_type] )
3061 m = &analysis.l0.me8x8[i];
3062 i_part_cost_old = m->cost;
3063 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3064 m->cost -= i_type_cost;
3065 x264_me_refine_qpel( h, m );
3066 if( !b_bidir )
3067 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3069 if( x264_mb_partition_listX_table[1][i_part_type] )
3071 m = &analysis.l1.me8x8[i];
3072 i_part_cost_old = m->cost;
3073 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3074 m->cost -= i_type_cost;
3075 x264_me_refine_qpel( h, m );
3076 if( !b_bidir )
3077 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3079 /* TODO: update mvp? */
3083 i_satd_inter = i_cost;
3085 if( analysis.i_mbrd )
3087 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3088 i_type = B_SKIP;
3089 i_cost = i_bskip_cost;
3090 i_partition = D_16x16;
3091 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3092 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3093 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3094 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3095 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3096 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3097 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3099 h->mb.i_type = i_type;
3100 h->mb.i_partition = i_partition;
3103 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3105 if( analysis.i_mbrd )
3107 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3108 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
3111 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3112 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3113 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3114 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3116 h->mb.i_type = i_type;
3117 h->mb.i_partition = i_partition;
3119 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3120 x264_intra_rd_refine( h, &analysis );
3121 if( h->mb.i_subpel_refine >= 5 )
3122 x264_refine_bidir( h, &analysis );
3124 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3126 int i_biweight;
3127 x264_analyse_update_cache( h, &analysis );
3129 if( i_partition == D_16x16 )
3131 if( i_type == B_L0_L0 )
3133 analysis.l0.me16x16.cost = i_cost;
3134 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3136 else if( i_type == B_L1_L1 )
3138 analysis.l1.me16x16.cost = i_cost;
3139 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3141 else if( i_type == B_BI_BI )
3143 i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3144 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3147 else if( i_partition == D_16x8 )
3149 for( int i = 0; i < 2; i++ )
3151 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3152 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3153 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3154 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3155 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3156 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3158 i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3159 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3163 else if( i_partition == D_8x16 )
3165 for( int i = 0; i < 2; i++ )
3167 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3168 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3169 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3170 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3171 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3172 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3174 i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3175 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3179 else if( i_partition == D_8x8 )
3181 for( int i = 0; i < 4; i++ )
3183 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3184 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3185 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3186 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3187 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3189 i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3190 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3198 x264_analyse_update_cache( h, &analysis );
3200 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3201 * without realizing it. Check for this and account for it if necessary. */
3202 if( analysis.i_mbrd >= 2 )
3204 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3205 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
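/* The designated initializers map each merge-eligible type to its MV list
 * plus 1 (0, the implicit default, means "don't check"). scan8[0] and
 * scan8[12] sit in the top-left and bottom-right 8x8s, which fall in
 * different halves of any 16x8 or 8x16 split, so a matching MV and ref
 * there means the halves have converged back to one 16x16. */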
3206 int list = check_mv_lists[h->mb.i_type] - 1;
3207 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3208 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3209 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3210 h->mb.i_partition = D_16x16;
3213 if( !analysis.i_mbrd )
3214 x264_mb_analyse_transform( h );
3216 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3217 x264_mb_analyse_qp_rd( h, &analysis );
3219 h->mb.b_trellis = h->param.analyse.i_trellis;
3220 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3221 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3222 x264_psy_trellis_init( h, 0 );
3223 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3224 h->mb.i_skip_intra = 0;
3227 /*-------------------- Update MB from the analysis ----------------------*/
3228 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3230 switch( h->mb.i_type )
3232 case I_4x4:
3233 for( int i = 0; i < 16; i++ )
3234 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3236 x264_mb_analyse_intra_chroma( h, a );
3237 break;
3238 case I_8x8:
3239 for( int i = 0; i < 4; i++ )
3240 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3242 x264_mb_analyse_intra_chroma( h, a );
3243 break;
3244 case I_16x16:
3245 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3246 x264_mb_analyse_intra_chroma( h, a );
3247 break;
3249 case I_PCM:
3250 break;
3252 case P_L0:
3253 switch( h->mb.i_partition )
3255 case D_16x16:
3256 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3257 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3258 break;
3260 case D_16x8:
3261 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3262 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3263 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3264 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3265 break;
3267 case D_8x16:
3268 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3269 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3270 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3271 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3272 break;
3274 default:
3275 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3276 break;
3277 }
3278 break;
3280 case P_8x8:
3281 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3282 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3283 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3284 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3285 for( int i = 0; i < 4; i++ )
3286 x264_mb_cache_mv_p8x8( h, a, i );
3287 break;
3289 case P_SKIP:
3291 h->mb.i_partition = D_16x16;
3292 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3293 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3294 break;
3297 case B_SKIP:
3298 case B_DIRECT:
3299 h->mb.i_partition = h->mb.cache.direct_partition;
3300 x264_mb_load_mv_direct8x8( h, 0 );
3301 x264_mb_load_mv_direct8x8( h, 1 );
3302 x264_mb_load_mv_direct8x8( h, 2 );
3303 x264_mb_load_mv_direct8x8( h, 3 );
3304 break;
3306 case B_8x8:
3307 /* optimize: cache might not need to be rewritten */
3308 for( int i = 0; i < 4; i++ )
3309 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3312 default: /* the rest of the B types */
3313 switch( h->mb.i_partition )
3315 case D_16x16:
3316 switch( h->mb.i_type )
3318 case B_L0_L0:
3319 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3320 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3322 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3323 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3324 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3325 break;
3326 case B_L1_L1:
3327 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3328 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3329 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3331 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3332 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3333 break;
3334 case B_BI_BI:
3335 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3336 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3338 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3339 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3340 break;
3341 }
3342 break;
3343 case D_16x8:
3344 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3345 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3346 break;
3347 case D_8x16:
3348 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3349 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3350 break;
3351 default:
3352 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3358 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3360 for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3362 int completed;
3363 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3364 if( ref < 0 )
3365 continue;
3366 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3367 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3369 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3370 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
3371 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
3372 h->mb.cache.mv[l][x264_scan8[15]][0],
3373 h->mb.cache.mv[l][x264_scan8[15]][1] );
3374 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
3375 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3376 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
3377 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3378 x264_mb_analyse_intra( h, a, COST_MAX );
3379 h->mb.i_type = I_16x16;
3380 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3381 x264_mb_analyse_intra_chroma( h, a );
3388 #include "slicetype.c"