1 /*****************************************************************************
2 * analyse.c: macroblock analysis
3 *****************************************************************************
4 * Copyright (C) 2003-2016 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
28 #include "common/common.h"
29 #include "macroblock.h"
31 #include "ratecontrol.h"
40 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
44 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
45 ALIGNED_4( int16_t mvc[32][5][2] );
49 int i_cost4x4[4]; /* cost per 8x8 partition */
50 x264_me_t me4x4[4][4];
53 int i_cost8x4[4]; /* cost per 8x8 partition */
54 x264_me_t me8x4[4][2];
57 int i_cost4x8[4]; /* cost per 8x8 partition */
58 x264_me_t me4x8[4][2];
68 } x264_mb_analysis_list_t;
72 /* conduct the analysis using this lambda and QP */
77 uint16_t *p_cost_ref[2];
82 /* Take some shortcuts in intra search if intra is deemed unlikely */
84 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
85 int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
90 int i_satd_i16x16_dir[7];
95 ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
105 int i_satd_chroma_dir[7];
106 int i_predict8x8chroma;
108 /* II: Inter part P/B frame */
109 x264_mb_analysis_list_t l0;
110 x264_mb_analysis_list_t l1;
112 int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
113 int i_cost16x16direct;
115 int i_cost8x8direct[4];
116 int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
117 int i_cost_est16x8[2]; /* Per-partition estimated cost */
118 int i_cost_est8x16[2];
127 int i_mb_partition16x8[2]; /* mb_partition_e */
128 int i_mb_partition8x16[2];
129 int i_mb_type16x8; /* mb_class_e */
132 int b_direct_available;
133 int b_early_terminate;
135 } x264_mb_analysis_t;
137 /* lambda = pow(2,qp/6-2) */
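/* e.g. qp=36: pow(2, 36/6 - 2) = 2^4 = 16, matching the qp=36 entry below. */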
138 const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
140 1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */
141 1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */
142 2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */
143 4, 4, 5, 6, 6, 7, 8, 9, /* 24-31 */
144 10, 11, 13, 14, 16, 18, 20, 23, /* 32-39 */
145 25, 29, 32, 36, 40, 45, 51, 57, /* 40-47 */
146 64, 72, 81, 91, 102, 114, 128, 144, /* 48-55 */
147 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
148 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
149 1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
150 2580,2896, /* 80-81 */
153 /* lambda2 = pow(lambda,2) * .9 * 256 */
154 /* Capped to avoid overflow */
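/* e.g. qp=12: lambda = 2^(12/6-2) = 1, so lambda2 = 1 * .9 * 256 = 230.4 -> 230 below.
 * The cap is 2^27-1 = 134217727. */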
155 const int x264_lambda2_tab[QP_MAX_MAX+1] =
157 14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */
158 91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */
159 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */
160 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24-31 */
161 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32-39 */
162 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40-47 */
163 943718, 1189010, 1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
164 5992238, 7549747, 9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
165 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */
166 134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
167 134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
170 const uint8_t x264_exp2_lut[64] =
172 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
173 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
174 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
175 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
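/* Each entry is round(256 * 2^(i/64)) - 256, i.e. the fractional part of a .8
 * fixed-point 2^x; used by x264_exp2fix8() in common/common.h. */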
178 const float x264_log2_lut[128] =
180 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
181 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
182 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
183 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
184 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
185 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
186 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
187 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
188 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
189 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
190 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
191 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
192 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
193 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
194 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
195 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
198 /* Avoid an int/float conversion. */
199 const float x264_log2_lz_lut[32] =
201 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
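/* Paired with x264_log2_lut in x264_log2() (common/common.h): with lz = clz(x),
 * log2(x) ~= x264_log2_lut[(x<<lz>>24)&0x7f] + x264_log2_lz_lut[lz];
 * this LUT is just the integer part 31-lz pre-stored as float. */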
204 // should the intra and inter lambdas be different?
205 // I'm just matching the behaviour of deadzone quant.
206 static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
208 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
210 46, 58, 73, 92, 117, 147,
211 185, 233, 294, 370, 466, 587,
212 740, 932, 1174, 1480, 1864, 2349,
213 2959, 3728, 4697, 5918, 7457, 9395,
214 11837, 14914, 18790, 23674, 29828, 37581,
215 47349, 59656, 75163, 94699, 119313, 150326,
216 189399, 238627, 300652, 378798, 477255, 601304,
217 757596, 954511, 1202608, 1515192, 1909022, 2405217,
218 3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
219 12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
220 48486154, 61088726, 76966972, 96972308,
221 122177453,134217727,134217727,134217727,134217727,134217727,
222 134217727,134217727,134217727,134217727,134217727,134217727,
224 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
226 27, 34, 43, 54, 68, 86,
227 108, 136, 172, 216, 273, 343,
228 433, 545, 687, 865, 1090, 1374,
229 1731, 2180, 2747, 3461, 4361, 5494,
230 6922, 8721, 10988, 13844, 17442, 21976,
231 27688, 34885, 43953, 55377, 69771, 87906,
232 110755, 139543, 175813, 221511, 279087, 351627,
233 443023, 558174, 703255, 886046, 1116348, 1406511,
234 1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
235 7088374, 8930791, 11252092, 14176748, 17861583, 22504184,
236 28353495, 35723165, 45008368, 56706990,
237 71446330, 90016736,113413980,134217727,134217727,134217727,
238 134217727,134217727,134217727,134217727,134217727,134217727,
239 134217727,134217727,134217727,134217727,134217727,134217727,
243 #define MAX_CHROMA_LAMBDA_OFFSET 36
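/* Approximately 16 * 2^(i/3) in .8 fixed point: index 12 (chroma QP equal to
 * luma QP) gives 256, i.e. no adjustment. */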
244 static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
246 16, 20, 25, 32, 40, 50,
247 64, 80, 101, 128, 161, 203,
248 256, 322, 406, 512, 645, 812,
249 1024, 1290, 1625, 2048, 2580, 3250,
250 4096, 5160, 6501, 8192, 10321, 13003,
251 16384, 20642, 26007, 32768, 41285, 52015,
255 /* TODO: calculate CABAC costs */
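/* Approximate CAVLC ue(v) code lengths, in bits, of the mb_type / sub_mb_type
 * syntax elements; they get multiplied by lambda where used below. */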
256 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
258 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
260 static const uint8_t i_mb_b16x8_cost_table[17] =
262 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
264 static const uint8_t i_sub_mb_b_cost_table[13] =
266 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
268 static const uint8_t i_sub_mb_p_cost_table[4] =
273 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
275 static uint16_t x264_cost_ref[QP_MAX+1][3][33];
276 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
277 static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
279 static int init_costs( x264_t *h, float *logs, int qp )
281 int lambda = x264_lambda_tab[qp];
284 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
285 CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
286 h->cost_mv[qp] += 2*4*2048;
287 for( int i = 0; i <= 2*4*2048; i++ )
290 h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
292 x264_pthread_mutex_lock( &cost_ref_mutex );
293 for( int i = 0; i < 3; i++ )
294 for( int j = 0; j < 33; j++ )
295 x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
296 x264_pthread_mutex_unlock( &cost_ref_mutex );
297 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
299 for( int j = 0; j < 4; j++ )
301 CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
302 h->cost_mv_fpel[qp][j] += 2*2048;
303 for( int i = -2*2048; i < 2*2048; i++ )
304 h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
307 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
308 for( int i = 0; i < 17; i++ )
309 cost_i4x4_mode[i] = 3*lambda*(i!=8);
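/* Indexed by (mode - predicted_mode) + 8: the predicted mode (i==8) costs only a
 * 1-bit flag, any other mode costs 1+3 bits, hence the 3-bit (3*lambda) penalty. */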
315 int x264_analyse_init_costs( x264_t *h )
317 float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
322 for( int i = 1; i <= 2*4*2048; i++ )
323 logs[i] = log2f( i+1 ) * 2.0f + 1.718f;
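/* 2*log2(i+1) approximates the Exp-Golomb bit cost of a signed mv delta of i
 * quarter-pels (~2*log2(|x|+1)+1); the .718 appears to be an empirical tweak. */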
325 for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
326 if( init_costs( h, logs, qp ) )
329 if( init_costs( h, logs, X264_LOOKAHEAD_QP ) )
339 void x264_analyse_free_costs( x264_t *h )
341 for( int i = 0; i < QP_MAX+1; i++ )
344 x264_free( h->cost_mv[i] - 2*4*2048 );
345 if( h->cost_mv_fpel[i][0] )
346 for( int j = 0; j < 4; j++ )
347 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
351 void x264_analyse_weight_frame( x264_t *h, int end )
353 for( int j = 0; j < h->i_ref[0]; j++ )
355 if( h->sh.weight[j][0].weightfn )
357 x264_frame_t *frame = h->fref[0][j];
358 int width = frame->i_width[0] + 2*PADH;
359 int i_padv = PADV << PARAM_INTERLACED;
361 pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
362 height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
363 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
364 h->fenc->i_lines_weighted += height;
366 for( int k = j; k < h->i_ref[0]; k++ )
367 if( h->sh.weight[k][0].weightfn )
369 pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
370 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
371 src + offset, frame->i_stride[0],
372 width, height, &h->sh.weight[k][0] );
379 /* initialize an array of lambda*nbits for all possible mvs */
380 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
382 a->p_cost_mv = h->cost_mv[a->i_qp];
383 a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
384 a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
387 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
389 int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
390 a->i_lambda = x264_lambda_tab[qp];
391 a->i_lambda2 = x264_lambda2_tab[qp];
393 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
394 if( h->param.analyse.i_trellis )
396 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
397 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
398 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
399 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
401 h->mb.i_psy_rd_lambda = a->i_lambda;
402 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
403 int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
404 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
406 if( qp > QP_MAX_SPEC )
408 h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
409 h->nr_residual_sum = h->nr_residual_sum_buf[1];
410 h->nr_count = h->nr_count_buf[1];
411 h->mb.b_noise_reduction = 1;
412 qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
416 h->nr_offset = h->nr_offset_denoise;
417 h->nr_residual_sum = h->nr_residual_sum_buf[0];
418 h->nr_count = h->nr_count_buf[0];
419 h->mb.b_noise_reduction = 0;
422 a->i_qp = h->mb.i_qp = qp;
423 h->mb.i_chroma_qp = h->chroma_qp_table[qp];
426 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
428 int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
430 /* mbrd == 1 -> RD mode decision */
431 /* mbrd == 2 -> RD refinement */
432 /* mbrd == 3 -> QPRD */
433 a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
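/* e.g. subme=9 in a P slice: (9>=6)+(9>=8)+0 -> i_mbrd = 2 (RD mode decision plus RD refinement). */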
434 h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
435 a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;
437 x264_mb_analyse_init_qp( h, a, qp );
439 h->mb.b_transform_8x8 = 0;
445 a->i_satd_chroma = COST_MAX;
447 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
448 * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
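/* lambda2 carries a *256 fixed-point scale (see x264_lambda2_tab), which the
 * rounded (+128) >> 8 removes here. */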
449 uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
450 a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;
453 a->b_avoid_topright = 0;
455 h->mb.b_lossless ? 0 :
457 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
459 /* II: Inter part P/B frame */
460 if( h->sh.i_type != SLICE_TYPE_I )
462 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
463 // limit motion search to a slightly smaller range than the theoretical limit,
464 // since the search may go a few iterations past its given range
465 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
467 /* Calculate max allowed MV range */
468 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
469 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
470 h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
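/* MVs are in quarter-pel units, hence the 4*( ... ). The +/-24 full-pel slack lets
 * motion compensation reference up to 24 pixels past the frame edge, which should
 * stay inside the PADH/PADV padded border. */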
471 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
472 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
473 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
475 int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
476 int max_mv = max_x - 4*16*h->mb.i_mb_x;
477 /* If we're left of the refresh bar, don't reference right of it. */
478 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
479 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
481 h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
482 h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
483 if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
485 int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
486 int thread_mvy_range = i_fmv_range;
488 if( h->i_thread_frames > 1 )
490 int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
491 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
492 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
493 for( int j = 0; j < h->i_ref[i]; j++ )
495 x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
496 thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
499 if( h->param.b_deterministic )
500 thread_mvy_range = h->param.analyse.i_mv_range_thread;
501 if( PARAM_INTERLACED )
502 thread_mvy_range >>= 1;
504 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
507 if( PARAM_INTERLACED )
509 /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
510 for( int i = 0; i < 3; i++ )
513 mb_y = (h->mb.i_mb_y >> j) + (i == 1);
514 h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
515 h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
516 h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
517 h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
518 h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
519 h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
520 h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
525 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
526 h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
527 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
528 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
529 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
530 h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
531 h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
534 if( PARAM_INTERLACED )
536 int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
537 h->mb.mv_min[1] = h->mb.mv_miny_row[i];
538 h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
539 h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
540 h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
541 h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
542 h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
550 a->l0.i_cost8x16 = COST_MAX;
551 if( h->sh.i_type == SLICE_TYPE_B )
556 a->i_cost8x8direct[0] =
557 a->i_cost8x8direct[1] =
558 a->i_cost8x8direct[2] =
559 a->i_cost8x8direct[3] =
568 a->i_cost16x16direct =
571 a->i_cost8x16bi = COST_MAX;
573 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
574 for( int i = 0; i < 4; i++ )
578 a->l0.i_cost4x8[i] = COST_MAX;
581 /* Fast intra decision */
582 if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
584 /* Always run in fast-intra mode for subme < 3 */
585 if( h->mb.i_subpel_refine > 2 &&
586 ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
587 IS_INTRA( h->mb.i_mb_type_top ) ||
588 IS_INTRA( h->mb.i_mb_type_topleft ) ||
589 IS_INTRA( h->mb.i_mb_type_topright ) ||
590 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
591 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
592 { /* intra is likely */ }
599 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
600 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
602 a->b_force_intra = 1;
604 a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
607 a->b_force_intra = 0;
611 /* Prediction modes allowed for various combinations of neighbors. */
612 /* Terminated by a -1. */
613 /* In order, no neighbors, left, top, top/left, top/left/topleft */
614 static const int8_t i16x16_mode_available[5][5] =
616 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
617 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
618 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
619 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
620 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
623 static const int8_t chroma_mode_available[5][5] =
625 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
626 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
627 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
628 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
629 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
632 static const int8_t i8x8_mode_available[2][5][10] =
635 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
636 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
637 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
638 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
639 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
642 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
643 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
644 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
645 {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
646 {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
650 static const int8_t i4x4_mode_available[2][5][10] =
653 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
654 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
655 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
656 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
657 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
660 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
661 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
662 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
663 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
664 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
668 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
670 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
671 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
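/* MB_LEFT==0x01 and MB_TOP==0x02, so idx selects the table rows in the order listed
 * above; only when left, top and topleft are all present does row 4 (full mode set) apply. */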
672 return i16x16_mode_available[idx];
675 static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
677 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
678 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
679 return chroma_mode_available[idx];
682 static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
684 int avoid_topright = force_intra && (i&1);
685 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
686 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
687 return i8x8_mode_available[avoid_topright][idx];
690 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
692 int avoid_topright = force_intra && ((i&5) == 5);
693 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
694 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
695 return i4x4_mode_available[avoid_topright][idx];
698 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
699 static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
701 ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
703 if( do_both_dct || h->mb.b_transform_8x8 )
704 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
705 if( do_both_dct || !h->mb.b_transform_8x8 )
706 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
709 /* Reset fenc satd scores cache for psy RD */
710 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
712 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
713 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
714 if( !h->mb.i_psy_rd )
716 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
717 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
719 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
722 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
724 if( a->i_satd_chroma < COST_MAX )
729 if( !h->mb.b_chroma_me )
731 a->i_satd_chroma = 0;
735 /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
736 if( h->mb.b_lossless )
738 x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
739 x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
743 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
744 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
746 a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
747 + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
751 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
752 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
754 /* Prediction selection for chroma */
755 if( predict_mode[3] >= 0 && !h->mb.b_lossless )
757 int satdu[4], satdv[4];
758 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
759 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
760 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
761 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
762 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
763 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
765 for( ; *predict_mode >= 0; predict_mode++ )
767 int i_mode = *predict_mode;
768 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
770 a->i_satd_chroma_dir[i_mode] = i_satd;
771 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
776 for( ; *predict_mode >= 0; predict_mode++ )
779 int i_mode = *predict_mode;
781 /* we do the prediction */
782 if( h->mb.b_lossless )
783 x264_predict_lossless_chroma( h, i_mode );
786 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
787 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
790 /* we calculate the cost */
791 i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
792 h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
793 a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
795 a->i_satd_chroma_dir[i_mode] = i_satd;
796 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
800 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
803 /* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
804 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
806 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
807 pixel *p_src = h->mb.pic.p_fenc[0];
808 pixel *p_dst = h->mb.pic.p_fdec[0];
809 static const int8_t intra_analysis_shortcut[2][2][2][5] =
811 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
812 {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
813 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
814 {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
815 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
816 {-1, -1, -1, -1, -1}},
817 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
818 {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
822 int lambda = a->i_lambda;
824 /*---------------- Try all modes and calculate their scores --------------*/
825 /* Disabled i16x16 for AVC-Intra compat */
826 if( !h->param.i_avcintra_class )
828 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
830 /* Not heavily tuned */
831 static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
832 int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
834 if( !h->mb.b_lossless && predict_mode[3] >= 0 )
836 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
837 a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
838 a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
839 a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
840 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
841 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
842 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
844 /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
845 if( a->i_satd_i16x16 <= i16x16_thresh )
847 h->predict_16x16[I_PRED_16x16_P]( p_dst );
848 a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
849 a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
850 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
855 for( ; *predict_mode >= 0; predict_mode++ )
858 int i_mode = *predict_mode;
860 if( h->mb.b_lossless )
861 x264_predict_lossless_16x16( h, 0, i_mode );
863 h->predict_16x16[i_mode]( p_dst );
865 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
866 lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
867 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
868 a->i_satd_i16x16_dir[i_mode] = i_satd;
872 if( h->sh.i_type == SLICE_TYPE_B )
873 /* cavlc mb type prefix */
874 a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
876 if( a->i_satd_i16x16 > i16x16_thresh )
880 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
881 /* 8x8 prediction selection */
882 if( flags & X264_ANALYSE_I8x8 )
884 ALIGNED_ARRAY_32( pixel, edge,[36] );
885 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
886 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
888 // FIXME some bias like in i4x4?
889 int i_cost = lambda * 4; /* base predmode costs */
890 h->mb.i_cbp_luma = 0;
892 if( h->sh.i_type == SLICE_TYPE_B )
893 i_cost += lambda * i_mb_b_cost_table[I_8x8];
895 for( idx = 0;; idx++ )
899 pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
900 pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
901 int i_best = COST_MAX;
902 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
904 const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
905 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
907 if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
909 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
910 i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
911 i_cost += i_best & 0xffff;
913 a->i_predict8x8[idx] = i_best;
914 if( idx == 3 || i_cost > i_satd_thresh )
916 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
920 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
922 ALIGNED_ARRAY_16( int32_t, satd,[9] );
923 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
924 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
925 satd[i_pred_mode] -= 3 * lambda;
926 for( int i = 2; i >= 0; i-- )
929 a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
930 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
933 /* Take analysis shortcuts: don't analyse modes that are too
934 * far away direction-wise from the favored mode. */
935 if( a->i_mbrd < 1 + a->b_fast_intra )
936 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
941 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
944 int i_mode = *predict_mode;
946 if( h->mb.b_lossless )
947 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
949 h->predict_8x8[i_mode]( p_dst_by, edge );
951 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
952 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
953 i_satd -= 3 * lambda;
955 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
956 a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
958 i_cost += i_best + 3*lambda;
960 if( idx == 3 || i_cost > i_satd_thresh )
962 if( h->mb.b_lossless )
963 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
965 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
966 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
968 /* we need to encode this block now (for next ones) */
969 x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
974 a->i_satd_i8x8 = i_cost;
975 if( h->mb.i_skip_intra )
977 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
978 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
979 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
980 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
981 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
982 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
983 if( h->mb.i_skip_intra == 2 )
984 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
989 static const uint16_t cost_div_fix8[3] = {1024,512,341};
990 a->i_satd_i8x8 = COST_MAX;
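/* Extrapolate the cost of the idx+1 blocks analysed before early termination to a
 * whole-MB estimate: cost_div_fix8[idx] = 256*4/(idx+1). */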
991 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
993 /* Not heavily tuned */
994 static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
995 if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
999 /* 4x4 prediction selection */
1000 if( flags & X264_ANALYSE_I4x4 )
1002 int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
1003 int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
1004 h->mb.i_cbp_luma = 0;
1006 if( a->b_early_terminate && a->i_mbrd )
1007 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
1009 if( h->sh.i_type == SLICE_TYPE_B )
1010 i_cost += lambda * i_mb_b_cost_table[I_4x4];
1012 for( idx = 0;; idx++ )
1014 pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
1015 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1016 int i_best = COST_MAX;
1017 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
1019 const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1021 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1022 /* emulate missing topright samples */
1023 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
1025 if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
1027 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
1028 i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
1029 i_cost += i_best & 0xffff;
1031 a->i_predict4x4[idx] = i_best;
1032 if( i_cost > i_satd_thresh || idx == 15 )
1034 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
1038 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
1040 ALIGNED_ARRAY_16( int32_t, satd,[9] );
1041 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
1042 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
1043 satd[i_pred_mode] -= 3 * lambda;
1044 i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
1045 COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
1046 COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
1048 /* Take analysis shortcuts: don't analyse modes that are too
1049 * far away direction-wise from the favored mode. */
1050 if( a->i_mbrd < 1 + a->b_fast_intra )
1051 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
1058 for( ; *predict_mode >= 0; predict_mode++ )
1061 int i_mode = *predict_mode;
1063 if( h->mb.b_lossless )
1064 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
1066 h->predict_4x4[i_mode]( p_dst_by );
1068 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
1069 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
1071 i_satd -= lambda * 3;
1075 a->i_predict4x4[idx] = i_mode;
1080 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
1084 i_cost += i_best + 3 * lambda;
1085 if( i_cost > i_satd_thresh || idx == 15 )
1087 if( h->mb.b_lossless )
1088 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
1090 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
1091 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1093 /* we need to encode this block now (for next ones) */
1094 x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
1098 a->i_satd_i4x4 = i_cost;
1099 if( h->mb.i_skip_intra )
1101 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
1102 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
1103 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1104 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
1105 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
1106 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
1107 if( h->mb.i_skip_intra == 2 )
1108 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
1112 a->i_satd_i4x4 = COST_MAX;
1116 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1118 if( !a->b_early_terminate )
1119 i_satd_thresh = COST_MAX;
1121 if( a->i_satd_i16x16 < i_satd_thresh )
1123 h->mb.i_type = I_16x16;
1124 x264_analyse_update_cache( h, a );
1125 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1128 a->i_satd_i16x16 = COST_MAX;
1130 if( a->i_satd_i4x4 < i_satd_thresh )
1132 h->mb.i_type = I_4x4;
1133 x264_analyse_update_cache( h, a );
1134 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
1137 a->i_satd_i4x4 = COST_MAX;
1139 if( a->i_satd_i8x8 < i_satd_thresh )
1141 h->mb.i_type = I_8x8;
1142 x264_analyse_update_cache( h, a );
1143 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1144 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1147 a->i_satd_i8x8 = COST_MAX;
1150 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1152 uint64_t i_satd, i_best;
1153 int plane_count = CHROMA444 ? 3 : 1;
1154 h->mb.i_skip_intra = 0;
1156 if( h->mb.i_type == I_16x16 )
1158 int old_pred_mode = a->i_predict16x16;
1159 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
1160 int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
1161 i_best = a->i_satd_i16x16;
1162 for( ; *predict_mode >= 0; predict_mode++ )
1164 int i_mode = *predict_mode;
1165 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1167 h->mb.i_intra16x16_pred_mode = i_mode;
1168 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1169 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1173 /* RD selection for chroma prediction */
1176 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
1177 if( predict_mode[1] >= 0 )
1179 int8_t predict_mode_sorted[4];
1181 int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
1183 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
1185 int i_mode = *predict_mode;
1186 if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
1187 predict_mode_sorted[i_max++] = i_mode;
1192 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1193 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1194 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1195 * coefs for the current chroma mode are still around, so we only
1196 * have to recount the bits. */
1197 i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1198 for( int i = 0; i < i_max; i++ )
1200 int i_mode = predict_mode_sorted[i];
1201 if( h->mb.b_lossless )
1202 x264_predict_lossless_chroma( h, i_mode );
1205 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
1206 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
1208 /* if we've already found a mode that needs no residual, then
1209 * probably any mode with a residual will be worse.
1210 * so avoid dct on the remaining modes to improve speed. */
1211 i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1212 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1214 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1215 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1220 if( h->mb.i_type == I_4x4 )
1222 pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
1224 for( int idx = 0; idx < 16; idx++ )
1226 pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
1227 h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
1228 h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
1229 i_best = COST_MAX64;
1231 const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1233 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1234 for( int p = 0; p < plane_count; p++ )
1235 /* emulate missing topright samples */
1236 MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );
1238 for( ; *predict_mode >= 0; predict_mode++ )
1240 int i_mode = *predict_mode;
1241 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1243 if( i_best > i_satd )
1245 a->i_predict4x4[idx] = i_mode;
1247 for( int p = 0; p < plane_count; p++ )
1249 pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
1250 pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
1251 pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
1252 pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
1253 nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
1258 for( int p = 0; p < plane_count; p++ )
1260 MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
1261 MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
1262 MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
1263 MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
1264 h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
1267 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1270 else if( h->mb.i_type == I_8x8 )
1272 ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
1273 pixel4 pels_h[3][2] = {{0}};
1274 pixel pels_v[3][7] = {{0}};
1275 uint16_t nnz[3][2] = {{0}}; // shut up gcc
1276 for( int idx = 0; idx < 4; idx++ )
1280 int s8 = X264_SCAN8_0 + 2*x + 16*y;
1281 pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
1282 h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
1283 h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
1284 int cbp_luma_new = 0;
1285 int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;
1287 i_best = COST_MAX64;
1289 const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
1290 for( int p = 0; p < plane_count; p++ )
1291 h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1293 for( ; *predict_mode >= 0; predict_mode++ )
1295 int i_mode = *predict_mode;
1296 if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
1299 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1300 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );
1302 if( i_best > i_satd )
1304 a->i_predict8x8[idx] = i_mode;
1305 cbp_luma_new = h->mb.i_cbp_luma;
1308 for( int p = 0; p < plane_count; p++ )
1310 pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
1311 pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
1313 for( int j = 0; j < 7; j++ )
1314 pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
1315 nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
1316 nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
1320 a->i_cbp_i8x8_luma = cbp_luma_new;
1321 for( int p = 0; p < plane_count; p++ )
1323 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
1324 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
1326 for( int j = 0; j < 7; j++ )
1327 dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
1328 M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
1329 M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
1332 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1337 #define LOAD_FENC(m, src, xoff, yoff) \
1339 (m)->p_cost_mv = a->p_cost_mv; \
1340 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1341 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1342 (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
1343 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1344 (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1345 (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1348 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1350 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1351 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1352 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1353 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1356 (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
1357 (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
1358 (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
1359 (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
1360 (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
1361 (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
1362 (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
1363 (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
1366 (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
1367 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1368 (m)->weight = x264_weight_none; \
1372 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1373 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1374 (m)->weight = h->sh.weight[i_ref];
1376 #define REF_COST(list, ref) \
1377 (a->p_cost_ref[list][ref])
1379 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1383 ALIGNED_4( int16_t mvc[8][2] );
1384 int i_halfpel_thresh = INT_MAX;
1385 int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
1387 /* 16x16 search on all ref frames */
1388 m.i_pixel = PIXEL_16x16;
1389 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1391 a->l0.me16x16.cost = INT_MAX;
1392 for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1394 m.i_ref_cost = REF_COST( 0, i_ref );
1395 i_halfpel_thresh -= m.i_ref_cost;
1397 /* search with ref */
1398 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1399 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1401 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1403 if( h->mb.ref_blind_dupe == i_ref )
1405 CP32( m.mv, a->l0.mvc[0][0] );
1406 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1410 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1411 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1414 /* save mv for predicting neighbors */
1415 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1416 CP32( a->l0.mvc[i_ref][0], m.mv );
1418 /* early termination
1419 * SSD threshold would probably be better than SATD */
1422 && m.cost-m.cost_mv < 300*a->i_lambda
1423 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1424 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1425 && x264_macroblock_probe_pskip( h ) )
1427 h->mb.i_type = P_SKIP;
1428 x264_analyse_update_cache( h, a );
1429 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1433 m.cost += m.i_ref_cost;
1434 i_halfpel_thresh += m.i_ref_cost;
1436 if( m.cost < a->l0.me16x16.cost )
1437 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1440 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1441 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1443 h->mb.i_type = P_L0;
1446 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1447 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1449 h->mb.i_partition = D_16x16;
1450 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1451 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1452 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1453 h->mb.i_type = P_SKIP;
1458 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1461 pixel **p_fenc = h->mb.pic.p_fenc;
1462 int i_maxref = h->mb.pic.i_fref[0]-1;
1464 h->mb.i_partition = D_8x8;
1466 #define CHECK_NEIGHBOUR(i)\
1468 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1469 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1473 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1474 * than those used by the neighbors */
1475 if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1476 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
1479 CHECK_NEIGHBOUR( -8 - 1 );
1480 CHECK_NEIGHBOUR( -8 + 0 );
1481 CHECK_NEIGHBOUR( -8 + 2 );
1482 CHECK_NEIGHBOUR( -8 + 4 );
1483 CHECK_NEIGHBOUR( 0 - 1 );
1484 CHECK_NEIGHBOUR( 2*8 - 1 );
1486 #undef CHECK_NEIGHBOUR
1488 for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1489 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1491 for( int i = 0; i < 4; i++ )
1493 x264_me_t *l0m = &a->l0.me8x8[i];
1497 m.i_pixel = PIXEL_8x8;
1499 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1500 l0m->cost = INT_MAX;
1501 for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1503 m.i_ref_cost = REF_COST( 0, i_ref );
1505 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1506 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1508 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1509 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1510 if( h->mb.ref_blind_dupe == i_ref )
1512 CP32( m.mv, a->l0.mvc[0][i+1] );
1513 x264_me_refine_qpel_refdupe( h, &m, NULL );
1516 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1518 m.cost += m.i_ref_cost;
1520 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1522 if( m.cost < l0m->cost )
1523 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1524 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1525 i_ref = h->mb.ref_blind_dupe;
1529 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1530 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1532 a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1534 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1535 are effectively zero. */
1536 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1537 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1540 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1541 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1542 /* P_8x8 ref0 has no ref cost */
1543 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1544 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1545 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1546 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1547 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1550 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1552 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1553 * reference frame flags. Thus, if we're not doing mixedrefs, just
1554 * don't bother analysing the dupes. */
1555 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1556 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1557 pixel **p_fenc = h->mb.pic.p_fenc;
1559 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1561 /* XXX Needed for x264_mb_predict_mv */
1562 h->mb.i_partition = D_8x8;
1565 CP32( mvc[0], a->l0.me16x16.mv );
1567 for( int i = 0; i < 4; i++ )
1569 x264_me_t *m = &a->l0.me8x8[i];
1573 m->i_pixel = PIXEL_8x8;
1574 m->i_ref_cost = i_ref_cost;
1576 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1577 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1578 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1580 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1581 x264_me_search( h, m, mvc, i_mvc );
1583 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1585 CP32( mvc[i_mvc], m->mv );
1588 a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1591 m->cost += i_ref_cost;
1592 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1593 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1596 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1597 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1598 /* theoretically this should include 4*ref_cost,
1599 * but 3 seems a better approximation of cabac. */
1600 if( h->param.b_cabac )
1601 a->l0.i_cost8x8 -= i_ref_cost;
1602 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1603 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1606 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1609 pixel **p_fenc = h->mb.pic.p_fenc;
1610 ALIGNED_4( int16_t mvc[3][2] );
1612 /* XXX Needed for x264_mb_predict_mv */
1613 h->mb.i_partition = D_16x8;
1615 for( int i = 0; i < 2; i++ )
1617 x264_me_t *l0m = &a->l0.me16x8[i];
1618 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1619 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1620 const int ref8[2] = { minref, maxref };
1621 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1623 m.i_pixel = PIXEL_16x8;
1625 LOAD_FENC( &m, p_fenc, 0, 8*i );
1626 l0m->cost = INT_MAX;
1627 for( int j = 0; j < i_ref8s; j++ )
1629 const int i_ref = ref8[j];
1630 m.i_ref_cost = REF_COST( 0, i_ref );
1632 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1633 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1634 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1635 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1637 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1638 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1640 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1641 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1642 /* We can only take this shortcut if the first search was performed on ref0. */
1643 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1645 /* We can just leave the MV from the previous ref search. */
1646 x264_me_refine_qpel_refdupe( h, &m, NULL );
1649 x264_me_search( h, &m, mvc, 3 );
1651 m.cost += m.i_ref_cost;
1653 if( m.cost < l0m->cost )
1654 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1657 /* Early termination based on the current SATD score of partition[0]
1658 plus the estimated SATD score of partition[1] */
1659 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1661 a->l0.i_cost16x8 = COST_MAX;
1665 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1666 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1669 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1672 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1675 pixel **p_fenc = h->mb.pic.p_fenc;
1676 ALIGNED_4( int16_t mvc[3][2] );
1678 /* XXX Needed for x264_mb_predict_mv */
1679 h->mb.i_partition = D_8x16;
1681 for( int i = 0; i < 2; i++ )
1683 x264_me_t *l0m = &a->l0.me8x16[i];
1684 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1685 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1686 const int ref8[2] = { minref, maxref };
1687 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1689 m.i_pixel = PIXEL_8x16;
1691 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1692 l0m->cost = INT_MAX;
1693 for( int j = 0; j < i_ref8s; j++ )
1695 const int i_ref = ref8[j];
1696 m.i_ref_cost = REF_COST( 0, i_ref );
1698 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1699 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1700 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1702 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1703 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1705 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1706 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1707 /* We can only take this shortcut if the first search was performed on ref0. */
1708 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1710 /* We can just leave the MV from the previous ref search. */
1711 x264_me_refine_qpel_refdupe( h, &m, NULL );
1714 x264_me_search( h, &m, mvc, 3 );
1716 m.cost += m.i_ref_cost;
1718 if( m.cost < l0m->cost )
1719 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1722 /* Early termination based on the current SATD score of partition[0]
1723 plus the estimated SATD score of partition[1] */
1724 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1726 a->l0.i_cost8x16 = COST_MAX;
1730 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1731 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1734 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1737 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
1738 pixel **p_fref, int i8x8, int size, int chroma )
1740 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
1741 pixel *pix2 = pix1+8;
1742 int i_stride = h->mb.pic.i_stride[1];
1743 int chroma_h_shift = chroma <= CHROMA_422;
1744 int chroma_v_shift = chroma == CHROMA_420;
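/* Subsampling shifts per format: 4:2:0 -> (h,v)=(1,1), 4:2:2 -> (1,0),
 * 4:4:4 -> (0,0); these drive the offset and height scaling below. */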
1745 int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
1746 int i_ref = a->l0.me8x8[i8x8].i_ref;
1747 int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
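/* MBAFF: a reference field of opposite parity is vertically displaced
 * relative to this field, so the chroma MV gets a +/-2 quarter-pel
 * correction whose sign depends on this MB row's parity (roughly; see the
 * interlaced MC rules). */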
1748 x264_weight_t *weight = h->sh.weight[i_ref];
1750 // FIXME weight can be done on 4x4 blocks even if mc is smaller
1751 #define CHROMA4x4MC( width, height, me, x, y ) \
1752 if( chroma == CHROMA_444 ) \
1754 int mvx = (me).mv[0] + 4*2*x; \
1755 int mvy = (me).mv[1] + 4*2*y; \
1756 h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
1757 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
1758 h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
1759 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
1763 int offset = x + (2>>chroma_v_shift)*16*y; \
1764 int chroma_height = (2>>chroma_v_shift)*height; \
1765 h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
1766 (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
1767 if( weight[1].weightfn ) \
1768 weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
1769 if( weight[2].weightfn ) \
1770 weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
1773 if( size == PIXEL_4x4 )
1775 x264_me_t *m = a->l0.me4x4[i8x8];
1776 CHROMA4x4MC( 2,2, m[0], 0,0 );
1777 CHROMA4x4MC( 2,2, m[1], 2,0 );
1778 CHROMA4x4MC( 2,2, m[2], 0,2 );
1779 CHROMA4x4MC( 2,2, m[3], 2,2 );
1781 else if( size == PIXEL_8x4 )
1783 x264_me_t *m = a->l0.me8x4[i8x8];
1784 CHROMA4x4MC( 4,2, m[0], 0,0 );
1785 CHROMA4x4MC( 4,2, m[1], 0,2 );
1789 x264_me_t *m = a->l0.me4x8[i8x8];
1790 CHROMA4x4MC( 2,4, m[0], 0,0 );
1791 CHROMA4x4MC( 2,4, m[1], 2,0 );
1795 int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
1796 int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
1797 return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1798 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
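/* The dispatcher below passes the chroma format as a constant to the
 * ALWAYS_INLINE helper, so each branch should compile into a dedicated
 * 4:4:4 / 4:2:2 / 4:2:0 version with the other format paths folded away. */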
1801 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1803 if( CHROMA_FORMAT == CHROMA_444 )
1804 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
1805 else if( CHROMA_FORMAT == CHROMA_422 )
1806 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
1808 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
1811 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1813 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1814 pixel **p_fenc = h->mb.pic.p_fenc;
1815 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1817 /* XXX Needed for x264_mb_predict_mv */
1818 h->mb.i_partition = D_8x8;
1820 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1822 const int idx = 4*i8x8 + i4x4;
1823 const int x4 = block_idx_x[idx];
1824 const int y4 = block_idx_y[idx];
1825 const int i_mvc = (i4x4 == 0);
1827 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1829 m->i_pixel = PIXEL_4x4;
1831 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1832 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1833 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1835 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1836 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1838 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1840 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1841 a->l0.me4x4[i8x8][1].cost +
1842 a->l0.me4x4[i8x8][2].cost +
1843 a->l0.me4x4[i8x8][3].cost +
1844 REF_COST( 0, i_ref ) +
1845 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1846 if( h->mb.b_chroma_me )
1847 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1850 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1852 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1853 pixel **p_fenc = h->mb.pic.p_fenc;
1854 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1856 /* XXX Needed for x264_mb_predict_mv */
1857 h->mb.i_partition = D_8x8;
1859 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1861 const int idx = 4*i8x8 + 2*i8x4;
1862 const int x4 = block_idx_x[idx];
1863 const int y4 = block_idx_y[idx];
1864 const int i_mvc = (i8x4 == 0);
1866 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1868 m->i_pixel = PIXEL_8x4;
1870 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1871 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1872 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1874 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1875 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1877 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1879 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1880 REF_COST( 0, i_ref ) +
1881 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1882 if( h->mb.b_chroma_me )
1883 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1886 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1888 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1889 pixel **p_fenc = h->mb.pic.p_fenc;
1890 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1892 /* XXX Needed for x264_mb_predict_mv */
1893 h->mb.i_partition = D_8x8;
1895 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1897 const int idx = 4*i8x8 + i4x8;
1898 const int x4 = block_idx_x[idx];
1899 const int y4 = block_idx_y[idx];
1900 const int i_mvc = (i4x8 == 0);
1902 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1904 m->i_pixel = PIXEL_4x8;
1906 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1907 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1908 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1910 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1911 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1913 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1915 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1916 REF_COST( 0, i_ref ) +
1917 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1918 if( h->mb.b_chroma_me )
1919 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1922 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
1924 ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
1925 ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
1926 int i_chroma_cost = 0;
1927 int chromapix = h->luma2chroma_pixel[i_pixel];
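/* Bipred chroma cost: motion-compensate the chroma of both references (two
 * extra full planes in 4:4:4), blend them with the bipred weights, then
 * compare against the source planes with mbcmp. */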
1929 #define COST_BI_CHROMA( m0, m1, width, height ) \
1933 h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
1934 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1935 h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
1936 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1937 h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
1938 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1939 h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
1940 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1944 int v_shift = CHROMA_V_SHIFT; \
1945 int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1946 int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1947 h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
1948 m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1949 h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
1950 m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1952 h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1953 h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1954 i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
1955 + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
1958 if( i_pixel == PIXEL_16x16 )
1959 COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
1960 else if( i_pixel == PIXEL_16x8 )
1961 COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
1962 else if( i_pixel == PIXEL_8x16 )
1963 COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
1965 COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
1967 return i_chroma_cost;
1970 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1972 /* Assumes that fdec still contains the results of
1973 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1975 pixel *p_fenc = h->mb.pic.p_fenc[0];
1976 pixel *p_fdec = h->mb.pic.p_fdec[0];
1978 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1979 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1981 int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
1983 for( int i = 0; i < 4; i++ )
1985 const int x = (i&1)*8;
1986 const int y = (i>>1)*8;
1987 a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
1988 &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1989 if( h->mb.b_chroma_me )
1991 int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
1992 int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
1993 a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
1994 &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
1995 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
1996 &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
1998 a->i_cost16x16direct += a->i_cost8x8direct[i];
2001 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
2006 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
2007 if( h->mb.b_chroma_me )
2009 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2010 a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
2011 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
2016 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
2018 ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
2019 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
2021 intptr_t stride0 = 16, stride1 = 16;
2023 ALIGNED_4( int16_t mvc[9][2] );
2024 int try_skip = a->b_try_skip;
2025 int list1_skipped = 0;
2026 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
2027 int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
2028 (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};
2031 m.i_pixel = PIXEL_16x16;
2033 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
2035 /* 16x16 Search on list 0 and list 1 */
2036 a->l0.me16x16.cost = INT_MAX;
2037 a->l1.me16x16.cost = INT_MAX;
2038 for( int l = 1; l >= 0; )
2040 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2042 /* This loop is extremely munged in order to facilitate the following order of operations,
2043 * necessary for an efficient fast skip.
2044 * 1. Search list1 ref0.
2045 * 2. Search list0 ref0.
2046 * 3. Try skip.
2047 * 4. Search the rest of list0.
2048 * 5. Go back and finish list1.
2050 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
2052 if( try_skip && l == 1 && i_ref > 0 )
2058 m.i_ref_cost = REF_COST( l, i_ref );
2060 /* search with ref */
2061 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
2062 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
2063 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
2064 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
2067 m.cost += m.i_ref_cost;
2069 if( m.cost < lX->me16x16.cost )
2070 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
2072 /* save mv for predicting neighbors */
2073 CP32( lX->mvc[i_ref][0], m.mv );
2074 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
2076 /* Fast skip detection. */
2077 if( i_ref == 0 && try_skip )
2079 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
2080 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
2086 /* We already tested skip */
2087 h->mb.i_type = B_SKIP;
2088 x264_analyse_update_cache( h, a );
2093 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
2095 if( list1_skipped && l == 0 )
2101 /* get cost of BI mode */
2102 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
2103 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
2104 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
2105 src0 = h->mc.get_ref( pix0, &stride0,
2106 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
2107 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
2108 src1 = h->mc.get_ref( pix1, &stride1,
2109 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
2110 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );
2112 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2114 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2115 + ref_costs
2116 + a->l0.bi16x16.cost_mv
2117 + a->l1.bi16x16.cost_mv;
2119 if( h->mb.b_chroma_me )
2120 a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
2122 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
2123 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
2125 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
2126 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
2127 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
2128 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
2129 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2130 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2131 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2132 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2133 + ref_costs + l0_mv_cost + l1_mv_cost;
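/* This check is cheap: with a zero MV both references are read straight from
 * the full-pel frame planes, so no get_ref() subpel interpolation is needed. */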
2135 if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
2137 ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
2141 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2142 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2143 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2144 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
2145 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2146 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2147 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2148 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
2152 ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
2153 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2154 int v_shift = CHROMA_V_SHIFT;
2156 if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
2158 int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2159 h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2160 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
2163 h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2164 h->mb.pic.i_stride[1], 16>>v_shift );
2166 if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
2168 int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2169 h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2170 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
2173 h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2174 h->mb.pic.i_stride[1], 16>>v_shift );
2176 h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
2177 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2178 h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
2179 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2181 cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
2182 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
2186 if( cost00 < a->i_cost16x16bi )
2188 M32( a->l0.bi16x16.mv ) = 0;
2189 M32( a->l1.bi16x16.mv ) = 0;
2190 a->l0.bi16x16.cost_mv = l0_mv_cost;
2191 a->l1.bi16x16.cost_mv = l1_mv_cost;
2192 a->i_cost16x16bi = cost00;
2197 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
2198 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
2199 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
2202 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
2207 switch( h->mb.i_sub_partition[i] )
2210 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
2213 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
2214 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
2217 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
2218 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
2221 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
2222 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
2223 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
2224 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
2227 x264_log( h, X264_LOG_ERROR, "internal error\n" );
2232 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
2236 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
2237 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
2238 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
2239 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
2242 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
2243 if( x264_mb_partition_listX_table[0][part] ) \
2245 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
2246 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
2250 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
2251 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
2253 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
2255 if( x264_mb_partition_listX_table[1][part] ) \
2257 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
2258 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
2262 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
2263 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
2265 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
2268 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2272 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
2274 x264_mb_load_mv_direct8x8( h, i );
2277 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
2278 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
2279 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
2284 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
2287 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2289 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
2291 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2293 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
2297 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
2299 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2300 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
2302 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
2303 * than those used by the neighbors */
2304 #define CHECK_NEIGHBOUR(i)\
2306 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
2307 if( ref > i_maxref[l] )\
2311 for( int l = 0; l < 2; l++ )
2313 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2314 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
2315 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
2318 CHECK_NEIGHBOUR( -8 - 1 );
2319 CHECK_NEIGHBOUR( -8 + 0 );
2320 CHECK_NEIGHBOUR( -8 + 2 );
2321 CHECK_NEIGHBOUR( -8 + 4 );
2322 CHECK_NEIGHBOUR( 0 - 1 );
2323 CHECK_NEIGHBOUR( 2*8 - 1 );
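/* scan8 offsets: -8-1 is the top-left neighbour, -8+0/+2/+4 sample the top
 * row, and 0-1 / 2*8-1 the left column. */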
2327 /* XXX Needed for x264_mb_predict_mv */
2328 h->mb.i_partition = D_8x8;
2332 for( int i = 0; i < 4; i++ )
2338 intptr_t stride[2] = {8,8};
2341 m.i_pixel = PIXEL_8x8;
2342 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2344 for( int l = 0; l < 2; l++ )
2346 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2348 lX->me8x8[i].cost = INT_MAX;
2349 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
2351 m.i_ref_cost = REF_COST( l, i_ref );
2353 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
2355 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
2356 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2357 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
2358 m.cost += m.i_ref_cost;
2360 if( m.cost < lX->me8x8[i].cost )
2362 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
2363 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
2366 /* save mv for predicting other partitions within this MB */
2367 CP32( lX->mvc[i_ref][i+1], m.mv );
2372 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
2373 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
2374 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
2375 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
2376 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
2377 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
2379 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2380 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
2381 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
2382 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2384 if( h->mb.b_chroma_me )
2386 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2387 i_part_cost_bi += i_chroma_cost;
2388 a->i_satd8x8[2][i] += i_chroma_cost;
2391 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2392 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2394 i_part_cost = a->l0.me8x8[i].cost;
2395 h->mb.i_sub_partition[i] = D_L0_8x8;
2396 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2397 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2398 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
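/* Cheapest of L0, L1, BI and direct wins this 8x8; the direct costs were
 * filled in earlier by x264_mb_analyse_inter_direct(). */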
2399 a->i_cost8x8bi += i_part_cost;
2401 /* XXX Needed for x264_mb_predict_mv */
2402 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2406 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2409 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2412 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2413 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2414 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2416 /* XXX Needed for x264_mb_predict_mv */
2417 h->mb.i_partition = D_8x8;
2421 for( int i = 0; i < 4; i++ )
2426 int i_part_cost_bi = 0;
2427 intptr_t stride[2] = {8,8};
2430 for( int l = 0; l < 2; l++ )
2432 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2433 x264_me_t *m = &lX->me8x8[i];
2434 m->i_pixel = PIXEL_8x8;
2435 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2437 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2438 m->i_ref = lX->me16x16.i_ref;
2440 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2442 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2443 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2444 x264_me_search( h, m, &lX->me16x16.mv, 1 );
2445 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2446 m->cost += m->i_ref_cost;
2448 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2450 /* save mv for predicting other partitions within this MB */
2451 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2454 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2455 m->mv[0], m->mv[1], 8, 8, x264_weight_none );
2456 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2458 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2459 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2460 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2461 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2462 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2464 if( h->mb.b_chroma_me )
2466 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2467 i_part_cost_bi += i_chroma_cost;
2468 a->i_satd8x8[2][i] += i_chroma_cost;
2471 i_part_cost = a->l0.me8x8[i].cost;
2472 h->mb.i_sub_partition[i] = D_L0_8x8;
2473 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2474 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2475 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2476 a->i_cost8x8bi += i_part_cost;
2478 /* XXX Needed for x264_mb_predict_mv */
2479 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2483 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2486 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2488 ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
2489 ALIGNED_4( int16_t mvc[3][2] );
2491 h->mb.i_partition = D_16x8;
2492 a->i_cost16x8bi = 0;
2494 for( int i = 0; i < 2; i++ )
2497 int i_part_cost_bi = 0;
2498 intptr_t stride[2] = {16,16};
2501 m.i_pixel = PIXEL_16x8;
2502 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2504 for( int l = 0; l < 2; l++ )
2506 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2507 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2508 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2509 lX->me16x8[i].cost = INT_MAX;
2510 for( int j = 0; j < i_ref8s; j++ )
2512 int i_ref = ref8[j];
2513 m.i_ref_cost = REF_COST( l, i_ref );
2515 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2517 CP32( mvc[0], lX->mvc[i_ref][0] );
2518 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2519 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2521 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2522 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2523 x264_me_search( h, &m, mvc, 3 );
2524 m.cost += m.i_ref_cost;
2526 if( m.cost < lX->me16x8[i].cost )
2527 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2532 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2533 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
2534 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2535 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
2536 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2537 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2539 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2540 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2541 + a->l1.me16x8[i].i_ref_cost;
2543 if( h->mb.b_chroma_me )
2544 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
2546 i_part_cost = a->l0.me16x8[i].cost;
2547 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2549 if( a->l1.me16x8[i].cost < i_part_cost )
2551 i_part_cost = a->l1.me16x8[i].cost;
2552 a->i_mb_partition16x8[i] = D_L1_8x8;
2554 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2556 i_part_cost = i_part_cost_bi;
2557 a->i_mb_partition16x8[i] = D_BI_8x8;
2559 a->i_cost16x8bi += i_part_cost;
2561 /* Early termination based on the current SATD score of partition[0]
2562 plus the estimated SATD score of partition[1] */
2563 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2564 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2566 a->i_cost16x8bi = COST_MAX;
2570 x264_mb_cache_mv_b16x8( h, a, i, 0 );
2574 a->i_mb_type16x8 = B_L0_L0
2575 + (a->i_mb_partition16x8[0]>>2) * 3
2576 + (a->i_mb_partition16x8[1]>>2);
2577 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
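/* The >>2 above maps D_L0_8x8/D_L1_8x8/D_BI_8x8 (3/7/11 in mb_partition_e)
 * to 0/1/2, so the two halves index a 3x3 {L0,L1,BI}x{L0,L1,BI} table of
 * types starting at B_L0_L0. */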
2580 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2582 ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
2583 ALIGNED_4( int16_t mvc[3][2] );
2585 h->mb.i_partition = D_8x16;
2586 a->i_cost8x16bi = 0;
2588 for( int i = 0; i < 2; i++ )
2591 int i_part_cost_bi = 0;
2592 intptr_t stride[2] = {8,8};
2595 m.i_pixel = PIXEL_8x16;
2596 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2598 for( int l = 0; l < 2; l++ )
2600 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2601 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2602 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2603 lX->me8x16[i].cost = INT_MAX;
2604 for( int j = 0; j < i_ref8s; j++ )
2606 int i_ref = ref8[j];
2607 m.i_ref_cost = REF_COST( l, i_ref );
2609 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2611 CP32( mvc[0], lX->mvc[i_ref][0] );
2612 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2613 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2615 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2616 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2617 x264_me_search( h, &m, mvc, 3 );
2618 m.cost += m.i_ref_cost;
2620 if( m.cost < lX->me8x16[i].cost )
2621 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2626 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2627 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
2628 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2629 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
2630 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2632 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2633 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2634 + a->l1.me8x16[i].i_ref_cost;
2636 if( h->mb.b_chroma_me )
2637 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
2639 i_part_cost = a->l0.me8x16[i].cost;
2640 a->i_mb_partition8x16[i] = D_L0_8x8;
2642 if( a->l1.me8x16[i].cost < i_part_cost )
2644 i_part_cost = a->l1.me8x16[i].cost;
2645 a->i_mb_partition8x16[i] = D_L1_8x8;
2647 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2649 i_part_cost = i_part_cost_bi;
2650 a->i_mb_partition8x16[i] = D_BI_8x8;
2652 a->i_cost8x16bi += i_part_cost;
2654 /* Early termination based on the current SATD score of partition[0]
2655 plus the estimated SATD score of partition[1] */
2656 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2657 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2659 a->i_cost8x16bi = COST_MAX;
2663 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2667 a->i_mb_type8x16 = B_L0_L0
2668 + (a->i_mb_partition8x16[0]>>2) * 3
2669 + (a->i_mb_partition8x16[1]>>2);
2670 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2673 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2675 int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
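/* Full RD below is only run on partitions whose SATD landed within 25% of
 * the best; the 16x16 gate that follows is more lenient (50%). */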
2677 h->mb.i_type = P_L0;
2678 if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
2680 h->mb.i_partition = D_16x16;
2681 x264_analyse_update_cache( h, a );
2682 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2685 if( a->l0.i_cost16x8 < thresh )
2687 h->mb.i_partition = D_16x8;
2688 x264_analyse_update_cache( h, a );
2689 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2692 a->l0.i_cost16x8 = COST_MAX;
2694 if( a->l0.i_cost8x16 < thresh )
2696 h->mb.i_partition = D_8x16;
2697 x264_analyse_update_cache( h, a );
2698 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2701 a->l0.i_cost8x16 = COST_MAX;
2703 if( a->l0.i_cost8x8 < thresh )
2705 h->mb.i_type = P_8x8;
2706 h->mb.i_partition = D_8x8;
2707 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2709 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2710 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2711 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2712 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2713 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2714 * for future blocks are those left over from previous RDO calls. */
2715 for( int i = 0; i < 4; i++ )
2717 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2718 int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
2719 int subtype, btype = D_L0_8x8;
2720 uint64_t bcost = COST_MAX64;
2721 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2724 if( costs[subtype] > sub8x8_thresh )
2726 h->mb.i_sub_partition[i] = subtype;
2727 x264_mb_cache_mv_p8x8( h, a, i );
2728 if( subtype == btype )
2730 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2731 COPY2_IF_LT( bcost, cost, btype, subtype );
2733 if( h->mb.i_sub_partition[i] != btype )
2735 h->mb.i_sub_partition[i] = btype;
2736 x264_mb_cache_mv_p8x8( h, a, i );
2741 x264_analyse_update_cache( h, a );
2742 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2745 a->l0.i_cost8x8 = COST_MAX;
2748 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2750 int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
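/* i.e. only modes within roughly 6% of the best inter SATD (about 12.5%
 * with psy-RD) get a full RD evaluation. */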
2752 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2754 h->mb.i_type = B_DIRECT;
2755 /* Assumes direct/skip MC is still in fdec */
2756 /* Requires b-rdo to be done before intra analysis */
2757 h->mb.b_skip_mc = 1;
2758 x264_analyse_update_cache( h, a );
2759 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2760 h->mb.b_skip_mc = 0;
2763 //FIXME not all the update_cache calls are needed
2764 h->mb.i_partition = D_16x16;
2766 if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
2768 h->mb.i_type = B_L0_L0;
2769 x264_analyse_update_cache( h, a );
2770 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2774 if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
2776 h->mb.i_type = B_L1_L1;
2777 x264_analyse_update_cache( h, a );
2778 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2782 if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
2784 h->mb.i_type = B_BI_BI;
2785 x264_analyse_update_cache( h, a );
2786 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2790 if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
2792 h->mb.i_type = B_8x8;
2793 h->mb.i_partition = D_8x8;
2794 x264_analyse_update_cache( h, a );
2795 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2796 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2800 if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
2802 h->mb.i_type = a->i_mb_type16x8;
2803 h->mb.i_partition = D_16x8;
2804 x264_analyse_update_cache( h, a );
2805 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2809 if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
2811 h->mb.i_type = a->i_mb_type8x16;
2812 h->mb.i_partition = D_8x16;
2813 x264_analyse_update_cache( h, a );
2814 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2818 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2822 if( IS_INTRA(h->mb.i_type) )
2825 switch( h->mb.i_partition )
2828 if( h->mb.i_type == B_BI_BI )
2830 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2831 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2835 for( int i = 0; i < 2; i++ )
2836 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2838 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2839 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2843 for( int i = 0; i < 2; i++ )
2844 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2846 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2847 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2851 for( int i = 0; i < 4; i++ )
2852 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2854 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2855 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2861 static inline void x264_mb_analyse_transform( x264_t *h )
2863 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2865 /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
2866 x264_mb_mc( h );
2868 int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
2869 int i_cost8 = 0, i_cost4 = 0;
2870 /* Not all platforms have a merged SATD function */
2871 if( h->pixf.sa8d_satd[PIXEL_16x16] )
2874 for( int p = 0; p < plane_count; p++ )
2876 cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2877 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2880 i_cost8 = (uint32_t)cost;
2881 i_cost4 = (uint32_t)(cost >> 32);
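/* The merged kernel returns both metrics packed into one 64-bit value:
 * sa8d (the 8x8-transform estimate) in the low 32 bits, satd in the high 32. */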
2885 for( int p = 0; p < plane_count; p++ )
2887 i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2888 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2889 i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2890 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2894 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2895 h->mb.b_skip_mc = 1;
2899 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2901 if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
2903 uint32_t subpart_bak = M32( h->mb.i_sub_partition );
2904 /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
2905 if( h->mb.i_type == P_8x8 )
2906 M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
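/* The four sub-partition bytes are written in one 32-bit store; multiplying
 * by 0x01010101 broadcasts D_L0_8x8 into each byte. */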
2907 else if( !x264_transform_allowed[h->mb.i_type] )
2910 x264_analyse_update_cache( h, a );
2911 h->mb.b_transform_8x8 ^= 1;
2912 /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
2913 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2915 if( *i_rd >= i_rd8 )
2917 if( *i_rd > 0 )
2918 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2919 *i_rd = i_rd8;
2921 else
2923 h->mb.b_transform_8x8 ^= 1;
2924 M32( h->mb.i_sub_partition ) = subpart_bak;
2929 /* Rate-distortion optimal QP selection.
2930 * FIXME: More than half of the benefit of this function seems to be
2931 * in the way it improves the coding of chroma DC (by decimating or
2932 * finding a better way to code a single DC coefficient.)
2933 * There must be a more efficient way to get that portion of the benefit
2934 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2935 * trick. */
2936 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2938 int bcost, cost, failures, prevcost, origcost;
2939 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2940 int last_qp_tried = 0;
2941 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2942 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2944 /* If CBP is already zero, don't raise the quantizer any higher. */
2945 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2947 /* Without psy-RD, require monotonicity when moving quant away from previous
2948 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2949 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2950 * allow 2 failures when moving quant towards previous quant.
2951 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2952 int threshold = (!!h->mb.i_psy_rd);
2953 /* Raise the threshold for failures if we're moving towards the last QP. */
2954 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2955 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2956 threshold++;
2957 h->mb.i_qp = orig_qp;
2959 prevcost = origcost;
2961 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2962 * (up to a point) will too. So, jump down to where the threshold will kick in
2963 * and check the QP there. If the CBP is still empty, skip the main loop.
2964 * If it isn't empty, we would have ended up having to check this QP anyways,
2965 * so as long as we store it for later lookup, we lose nothing. */
2966 int already_checked_qp = -1;
2967 int already_checked_cost = COST_MAX;
2968 if( direction == -1 )
2972 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
2973 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2974 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2975 if( !h->mb.cbp[h->mb.i_mb_xy] )
2977 /* If our empty-CBP block is lower QP than the last QP,
2978 * the last QP almost surely doesn't have a CBP either. */
2979 if( h->mb.i_last_qp > h->mb.i_qp )
2983 already_checked_qp = h->mb.i_qp;
2984 h->mb.i_qp = orig_qp;
2988 h->mb.i_qp += direction;
2989 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
2991 if( h->mb.i_last_qp == h->mb.i_qp )
2993 if( h->mb.i_qp == already_checked_qp )
2994 cost = already_checked_cost;
2997 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2998 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2999 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3002 /* We can't assume that the costs are monotonic over QPs.
3003 * Treating a tie as a failure seems to give better results. */
3004 if( cost < prevcost )
3010 if( failures > threshold )
3012 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
3014 h->mb.i_qp += direction;
3018 /* Always try the last block's QP. */
3019 if( !last_qp_tried )
3021 h->mb.i_qp = h->mb.i_last_qp;
3022 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3023 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3024 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3027 h->mb.i_qp = bqp;
3028 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3030 /* Check transform again; decision from before may no longer be optimal. */
3031 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
3032 x264_mb_transform_8x8_allowed( h ) )
3034 h->mb.b_transform_8x8 ^= 1;
3035 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3037 h->mb.b_transform_8x8 ^= 1;
3041 /*****************************************************************************
3042 * x264_macroblock_analyse:
3043 *****************************************************************************/
3044 void x264_macroblock_analyse( x264_t *h )
3046 x264_mb_analysis_t analysis;
3047 int i_cost = COST_MAX;
3049 h->mb.i_qp = x264_ratecontrol_mb_qp( h );
3050 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
3051 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
3052 if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
3053 h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
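/* e.g. if the previous MB coded QP 26 and AQ asks for 25 or 27, code 26
 * again: the texture cost barely changes and the delta-QP bits are saved. */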
3055 if( h->param.analyse.b_mb_info )
3056 h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
3057 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
3059 /*--------------------------- Do the analysis ---------------------------*/
3060 if( h->sh.i_type == SLICE_TYPE_I )
3063 if( analysis.i_mbrd )
3064 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3065 x264_mb_analyse_intra( h, &analysis, COST_MAX );
3066 if( analysis.i_mbrd )
3067 x264_intra_rd( h, &analysis, COST_MAX );
3069 i_cost = analysis.i_satd_i16x16;
3070 h->mb.i_type = I_16x16;
3071 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
3072 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
3073 if( analysis.i_satd_pcm < i_cost )
3074 h->mb.i_type = I_PCM;
3076 else if( analysis.i_mbrd >= 2 )
3077 x264_intra_rd_refine( h, &analysis );
3079 else if( h->sh.i_type == SLICE_TYPE_P )
3083 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
3085 analysis.b_try_skip = 0;
3086 if( analysis.b_force_intra )
3088 if( !h->param.analyse.b_psy )
3090 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3091 goto intra_analysis;
3096 /* Special fast-skip logic using information from mb_info. */
3097 if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
3099 if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
3100 h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
3102 h->mb.i_partition = D_16x16;
3103 /* Use the P-SKIP MV if we can... */
3104 if( !M32(h->mb.cache.pskip_mv) )
3107 h->mb.i_type = P_SKIP;
3109 /* Otherwise, just force a 16x16 block. */
3112 h->mb.i_type = P_L0;
3113 analysis.l0.me16x16.i_ref = 0;
3114 M32( analysis.l0.me16x16.mv ) = 0;
3118 /* Reset the information accordingly */
3119 else if( h->param.analyse.b_mb_info_update )
3120 h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
3123 int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
3124 /* If the current macroblock is off the frame, just skip it. */
3125 if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
3127 /* Fast P_SKIP detection */
3128 else if( h->param.analyse.b_fast_pskip )
3131 // FIXME don't need to check this if the reference frame is done
3133 else if( h->param.analyse.i_subpel_refine >= 3 )
3134 analysis.b_try_skip = 1;
3135 else if( h->mb.i_mb_type_left[0] == P_SKIP ||
3136 h->mb.i_mb_type_top == P_SKIP ||
3137 h->mb.i_mb_type_topleft == P_SKIP ||
3138 h->mb.i_mb_type_topright == P_SKIP )
3139 b_skip = x264_macroblock_probe_pskip( h );
3143 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
3147 h->mb.i_type = P_SKIP;
3148 h->mb.i_partition = D_16x16;
3149 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
3151 /* Set up MVs for future predictors */
3152 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3153 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3157 const unsigned int flags = h->param.analyse.inter;
3160 int i_satd_inter, i_satd_intra;
3162 x264_mb_analyse_load_costs( h, &analysis );
3164 x264_mb_analyse_inter_p16x16( h, &analysis );
3166 if( h->mb.i_type == P_SKIP )
3168 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3169 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3173 if( flags & X264_ANALYSE_PSUB16x16 )
3175 if( h->param.analyse.b_mixed_references )
3176 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
3178 x264_mb_analyse_inter_p8x8( h, &analysis );
3181 /* Select best inter mode */
3183 i_partition = D_16x16;
3184 i_cost = analysis.l0.me16x16.cost;
3186 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3187 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
3190 i_partition = D_8x8;
3191 i_cost = analysis.l0.i_cost8x8;
3194 if( flags & X264_ANALYSE_PSUB8x8 )
3196 for( int i = 0; i < 4; i++ )
3198 x264_mb_analyse_inter_p4x4( h, &analysis, i );
3199 int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
3200 if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
3202 int i_cost8x8 = analysis.l0.i_cost4x4[i];
3203 h->mb.i_sub_partition[i] = D_L0_4x4;
3205 x264_mb_analyse_inter_p8x4( h, &analysis, i );
3206 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
3207 h->mb.i_sub_partition[i], D_L0_8x4 );
3209 x264_mb_analyse_inter_p4x8( h, &analysis, i );
3210 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
3211 h->mb.i_sub_partition[i], D_L0_4x8 );
3213 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
3215 x264_mb_cache_mv_p8x8( h, &analysis, i );
3217 analysis.l0.i_cost8x8 = i_cost;
3221 /* Now do 16x8/8x16 */
3222 int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
3223 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3224 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
3226 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
3227 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3228 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
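/* Estimated cost of the untested bottom half: its two 8x8 SATDs plus the
 * average MV/ref overhead of the corresponding 8x8 searches. */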
3230 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
3231 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
3233 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
3234 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3235 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3237 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
3238 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
3241 h->mb.i_partition = i_partition;
3244 //FIXME mb_type costs?
3245 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3249 else if( i_partition == D_16x16 )
3251 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3252 i_cost = analysis.l0.me16x16.cost;
3254 else if( i_partition == D_16x8 )
3256 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
3257 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
3258 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
3260 else if( i_partition == D_8x16 )
3262 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
3263 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
3264 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
3266 else if( i_partition == D_8x8 )
3269 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3271 switch( h->mb.i_sub_partition[i8x8] )
3274 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
3275 i_cost += analysis.l0.me8x8[i8x8].cost;
3278 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
3279 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
3280 i_cost += analysis.l0.me8x4[i8x8][0].cost +
3281 analysis.l0.me8x4[i8x8][1].cost;
3284 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
3285 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
3286 i_cost += analysis.l0.me4x8[i8x8][0].cost +
3287 analysis.l0.me4x8[i8x8][1].cost;
3291 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
3292 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
3293 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
3294 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
3295 i_cost += analysis.l0.me4x4[i8x8][0].cost +
3296 analysis.l0.me4x4[i8x8][1].cost +
3297 analysis.l0.me4x4[i8x8][2].cost +
3298 analysis.l0.me4x4[i8x8][3].cost;
3301 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
3307 if( h->mb.b_chroma_me )
3311 x264_mb_analyse_intra( h, &analysis, i_cost );
3312 x264_mb_analyse_intra_chroma( h, &analysis );
3316 x264_mb_analyse_intra_chroma( h, &analysis );
3317 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
3319 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3320 analysis.i_satd_i8x8 += analysis.i_satd_chroma;
3321 analysis.i_satd_i4x4 += analysis.i_satd_chroma;
3324 x264_mb_analyse_intra( h, &analysis, i_cost );
3326 i_satd_inter = i_cost;
3327 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
3328 analysis.i_satd_i8x8,
3329 analysis.i_satd_i4x4 );
3331 if( analysis.i_mbrd )
3333 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
3335 i_partition = D_16x16;
3336 i_cost = analysis.l0.i_rd16x16;
3337 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
3338 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
3339 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
3340 h->mb.i_type = i_type;
3341 h->mb.i_partition = i_partition;
3342 if( i_cost < COST_MAX )
3343 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3344 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
3347 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3348 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3349 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3350 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3352 h->mb.i_type = i_type;
3354 if( analysis.b_force_intra && !IS_INTRA(i_type) )
3356 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
3357 * it were an inter block. */
3358 x264_analyse_update_cache( h, &analysis );
3359 x264_macroblock_encode( h );
3360 for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
3361 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
3364 int height = 16 >> CHROMA_V_SHIFT;
3365 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
3366 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
3368 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3369 goto intra_analysis;
        if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
        {
            if( IS_INTRA( h->mb.i_type ) )
            {
                x264_intra_rd_refine( h, &analysis );
            }
            else if( i_partition == D_16x16 )
            {
                x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                analysis.l0.me16x16.cost = i_cost;
                x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
            }
            else if( i_partition == D_16x8 )
            {
                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
            }
            else if( i_partition == D_8x16 )
            {
                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
            }
            else if( i_partition == D_8x8 )
            {
                x264_analyse_update_cache( h, &analysis );
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                    }
                    else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                    }
                    else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                    }
                    else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
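        /* Three ways to accept B_SKIP cheaply: the MB lies entirely below the
         * coded picture (possible with interlaced padding), the direct
         * prediction's SSD beats the cheapest possible coded MB in RD terms, or a
         * plain skip probe when neither mbrd nor direct=auto applies. */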
        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
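                /* i_lambda2 is lambda^2 * .9 * 256 (see the lambda2 table comment),
                 * so (6 * i_lambda2 + 128) >> 8 rescales those 6 bits of rate into
                 * SSD units with rounding: skip whenever the direct prediction's
                 * SSD is at most roughly 6 * lambda^2. */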
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }
        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;

            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );
            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }
            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
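            /* Early termination: if direct's SATD is within ~3% (33/32) of the
             * best 16x16 mode, it's worth computing the real RD costs now; when
             * the B_SKIP RD cost beats every 16x16 candidate, analysis of this
             * MB can stop entirely. */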
            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }
            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
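                /* Each 16x8 half is scored as the sum of its two 8x8 SATDs plus
                 * the averaged MV/ref cost of those 8x8 blocks, for L0, L1 and BI;
                 * 8x16 pairs the 8x8 blocks column-wise instead. Full motion
                 * searches then run only for shapes the estimate can't rule out. */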
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
                    /* 16x8 */
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;

                    /* 8x16 */
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
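                /* D_L0_8x8/D_L1_8x8/D_BI_8x8 are 3/7/11, so partition>>2 maps them
                 * to 0/1/2, and B_L0_L0 + first*3 + second indexes the matching
                 * two-list macroblock type whose header cost the estimate carries. */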
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel-only (everything else is already either implicitly qpel or has been refined) */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }
            i_satd_inter = i_cost;
            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
            }

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;
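            /* Let intra compete against the chosen inter mode. With chroma ME the
             * inter scores already include chroma distortion, so the intra costs
             * fold in analysis.i_satd_chroma to keep the comparison fair. */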
            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_satd_inter );
            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
            }
            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;
            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );
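            /* Final qpel-RD pass for explicitly coded B types: single-list
             * partitions refine one list; bidirectional partitions refine both
             * jointly, weighted by the bipred weight of the chosen reference pair. */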
            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }
    x264_analyse_update_cache( h, &analysis );
    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
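        /* scan8[0] is the top-left 4x4 block and scan8[12] the first block of the
         * bottom-right 8x8 quadrant; those two always fall in different halves of
         * a 16x8 or 8x16 split, so a matching MV and ref there means the split
         * collapsed back into a single 16x16 motion. */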
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
    }
    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}
/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;
        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;
        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }
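/* Debug-only consistency check for frame threading: a motion vector must not
 * reference rows of a frame that the other thread has not yet finished
 * (orig->i_lines_completed). Tripping this means analysis escaped the
 * mv_max_spel limit, so recover by forcing the macroblock to intra. */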
#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}
#include "slicetype.c"