1 /*****************************************************************************
2 * analyse.c: macroblock analysis
3 *****************************************************************************
4 * Copyright (C) 2003-2016 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
28 #include "common/common.h"
29 #include "macroblock.h"
31 #include "ratecontrol.h"
40 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
44 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
45 ALIGNED_4( int16_t mvc[32][5][2] );
49 int i_cost4x4[4]; /* cost per 8x8 partition */
50 x264_me_t me4x4[4][4];
53 int i_cost8x4[4]; /* cost per 8x8 partition */
54 x264_me_t me8x4[4][2];
57 int i_cost4x8[4]; /* cost per 8x8 partition */
58 x264_me_t me4x8[4][2];
68 } x264_mb_analysis_list_t;
72 /* conduct the analysis using this lambda and QP */
77 uint16_t *p_cost_ref[2];
82 /* Take some shortcuts in intra search if intra is deemed unlikely */
84 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
85 int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
90 int i_satd_i16x16_dir[7];
95 ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
105 int i_satd_chroma_dir[7];
106 int i_predict8x8chroma;
108 /* II: Inter part P/B frame */
109 x264_mb_analysis_list_t l0;
110 x264_mb_analysis_list_t l1;
112 int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
113 int i_cost16x16direct;
115 int i_cost8x8direct[4];
116 int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
117 int i_cost_est16x8[2]; /* Per-partition estimated cost */
118 int i_cost_est8x16[2];
127 int i_mb_partition16x8[2]; /* mb_partition_e */
128 int i_mb_partition8x16[2];
129 int i_mb_type16x8; /* mb_class_e */
132 int b_direct_available;
133 int b_early_terminate;
135 } x264_mb_analysis_t;
137 /* lambda = pow(2,qp/6-2) */
138 const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
140 1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */
141 1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */
142 2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */
143 4, 4, 5, 6, 6, 7, 8, 9, /* 24-31 */
144 10, 11, 13, 14, 16, 18, 20, 23, /* 32-39 */
145 25, 29, 32, 36, 40, 45, 51, 57, /* 40-47 */
146 64, 72, 81, 91, 102, 114, 128, 144, /* 48-55 */
147 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
148 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
149 1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
150 2580,2896, /* 80-81 */
153 /* lambda2 = pow(lambda,2) * .9 * 256 */
154 /* Capped to avoid overflow */
155 const int x264_lambda2_tab[QP_MAX_MAX+1] =
157 14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */
158 91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */
159 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */
160 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24-31 */
161 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32-39 */
162 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40-47 */
163 943718, 1189010, 1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
164 5992238, 7549747, 9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
165 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */
166 134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
167 134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
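/* As a sanity check, both tables above follow directly from the commented
 * formulas; a standalone sketch (not code the encoder runs) that regenerates
 * them to within a count or two of rounding: */
#include <math.h>
static int gen_lambda( int qp )     /* lambda = pow(2,qp/6-2): qp=24 -> 4, qp=48 -> 64 */
{
    return (int)( pow( 2, qp/6.0 - 2 ) + 0.5 );
}
static int gen_lambda2( int qp )    /* lambda2 = lambda^2 * .9 * 256: qp=12 -> 230 */
{
    double l  = pow( 2, qp/6.0 - 2 );
    double l2 = l * l * 0.9 * 256;
    return l2 > 134217727 ? 134217727 : (int)( l2 + 0.5 );   /* same (1<<27)-1 cap as above */
}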
170 const uint8_t x264_exp2_lut[64] =
172 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
173 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
174 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
175 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
178 const float x264_log2_lut[128] =
180 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
181 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
182 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
183 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
184 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
185 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
186 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
187 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
188 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
189 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
190 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
191 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
192 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
193 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
194 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
195 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
198 /* Avoid an int/float conversion. */
199 const float x264_log2_lz_lut[32] =
201 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
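/* Sketch of how these three tables fit together (x264's own helpers, not shown
 * in this file, combine them the same way; __builtin_clz stands in for a
 * portable clz here):
 *   x264_exp2_lut[i]    ~= (2^(i/64) - 1) * 256    fractional part of exp2, Q8
 *   x264_log2_lut[i]    =  log2(1 + i/128)         mantissa correction
 *   x264_log2_lz_lut[n] =  31 - n                  integer part, from the MSB position */
static float approx_log2( uint32_t x )   /* x > 0; e.g. x = 1536 -> 10.58496 */
{
    int lz = __builtin_clz( x );
    return x264_log2_lz_lut[lz] + x264_log2_lut[(x << lz >> 24) & 0x7f];
}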
204 // should the intra and inter lambdas be different?
205 // I'm just matching the behaviour of deadzone quant.
206 static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
208 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
210 46, 58, 73, 92, 117, 147,
211 185, 233, 294, 370, 466, 587,
212 740, 932, 1174, 1480, 1864, 2349,
213 2959, 3728, 4697, 5918, 7457, 9395,
214 11837, 14914, 18790, 23674, 29828, 37581,
215 47349, 59656, 75163, 94699, 119313, 150326,
216 189399, 238627, 300652, 378798, 477255, 601304,
217 757596, 954511, 1202608, 1515192, 1909022, 2405217,
218 3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
219 12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
220 48486154, 61088726, 76966972, 96972308,
221 122177453,134217727,134217727,134217727,134217727,134217727,
222 134217727,134217727,134217727,134217727,134217727,134217727,
224 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
226 27, 34, 43, 54, 68, 86,
227 108, 136, 172, 216, 273, 343,
228 433, 545, 687, 865, 1090, 1374,
229 1731, 2180, 2747, 3461, 4361, 5494,
230 6922, 8721, 10988, 13844, 17442, 21976,
231 27688, 34885, 43953, 55377, 69771, 87906,
232 110755, 139543, 175813, 221511, 279087, 351627,
233 443023, 558174, 703255, 886046, 1116348, 1406511,
234 1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
235 7088374, 8930791, 11252092, 14176748, 17861583, 22504184,
236 28353495, 35723165, 45008368, 56706990,
237 71446330, 90016736,113413980,134217727,134217727,134217727,
238 134217727,134217727,134217727,134217727,134217727,134217727,
239 134217727,134217727,134217727,134217727,134217727,134217727,
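/* Both rows follow the commented formulas with LAMBDA_BITS == 4 (the value
 * that makes them match the entries above), e.g. inter qp=0:
 * .85*.85 * 2^(0/3 + 6) = 46.2 -> 46.  Standalone sketch: */
#include <math.h>
static int gen_trellis_lambda2( int qp, int b_intra )
{
    double coef = b_intra ? 0.65 * 0.65 : 0.85 * 0.85;
    double v = coef * pow( 2, qp/3.0 + 10 - 4 );          /* 10 - LAMBDA_BITS */
    return v > 134217727 ? 134217727 : (int)( v + 0.5 );  /* same overflow cap */
}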
243 #define MAX_CHROMA_LAMBDA_OFFSET 36
244 static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
246 16, 20, 25, 32, 40, 50,
247 64, 80, 101, 128, 161, 203,
248 256, 322, 406, 512, 645, 812,
249 1024, 1290, 1625, 2048, 2580, 3250,
250 4096, 5160, 6501, 8192, 10321, 13003,
251 16384, 20642, 26007, 32768, 41285, 52015,
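/* The entries follow 16 * 2^(i/3) (within a unit of rounding).  Indexed with
 * i = qp - chroma_qp + 12 in x264_mb_analyse_init_qp() below, the table scales
 * the chroma lambda2 by 2^((qp-chroma_qp)/3) in Q8, where 256 means "unchanged".
 * Standalone sketch of the generator: */
#include <math.h>
static int gen_chroma_lambda2_offset( int i )   /* i=12 -> 256, i=15 -> 512 */
{
    return (int)( 16 * pow( 2, i/3.0 ) + 0.5 );
}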
255 /* TODO: calculate CABAC costs */
256 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
258 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
260 static const uint8_t i_mb_b16x8_cost_table[17] =
262 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
264 static const uint8_t i_sub_mb_b_cost_table[13] =
266 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
268 static const uint8_t i_sub_mb_p_cost_table[4] =
273 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
275 static uint16_t x264_cost_ref[QP_MAX+1][3][33];
276 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
277 static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
279 static int init_costs( x264_t *h, float *logs, int qp )
281 int lambda = x264_lambda_tab[qp];
284 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
285 CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
286 h->cost_mv[qp] += 2*4*2048;
287 for( int i = 0; i <= 2*4*2048; i++ )
290 h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
292 x264_pthread_mutex_lock( &cost_ref_mutex );
293 for( int i = 0; i < 3; i++ )
294 for( int j = 0; j < 33; j++ )
295 x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
296 x264_pthread_mutex_unlock( &cost_ref_mutex );
297 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
299 for( int j = 0; j < 4; j++ )
301 CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
302 h->cost_mv_fpel[qp][j] += 2*2048;
303 for( int i = -2*2048; i < 2*2048; i++ )
304 h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
307 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
308 for( int i = 0; i < 17; i++ )
309 cost_i4x4_mode[i] = 3*lambda*(i!=8);
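/* The mv-cost loops above implement a simple model: coding a qpel mv residual d
 * is charged roughly lambda * (2*log2(|d|+1) + 1.718) "bits" (the logs[] table
 * filled in x264_analyse_init_costs() below), clamped to 16 bits.  Standalone
 * sketch with a worked number: */
#include <math.h>
#include <stdlib.h>
static int approx_mv_cost( int lambda, int qpel_delta )
{
    float bits = log2f( abs(qpel_delta) + 1 ) * 2.0f + 1.718f;
    float cost = lambda * bits + 0.5f;       /* lambda=5 (qp 26), delta=16 -> 49 */
    return cost > 65535 ? 65535 : (int)cost;
}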
315 int x264_analyse_init_costs( x264_t *h )
317 float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
322 for( int i = 1; i <= 2*4*2048; i++ )
323 logs[i] = log2f( i+1 ) * 2.0f + 1.718f;
325 for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
326 if( init_costs( h, logs, qp ) )
329 if( init_costs( h, logs, X264_LOOKAHEAD_QP ) )
339 void x264_analyse_free_costs( x264_t *h )
341 for( int i = 0; i < QP_MAX+1; i++ )
344 x264_free( h->cost_mv[i] - 2*4*2048 );
345 if( h->cost_mv_fpel[i][0] )
346 for( int j = 0; j < 4; j++ )
347 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
351 void x264_analyse_weight_frame( x264_t *h, int end )
353 for( int j = 0; j < h->i_ref[0]; j++ )
355 if( h->sh.weight[j][0].weightfn )
357 x264_frame_t *frame = h->fref[0][j];
358 int width = frame->i_width[0] + 2*PADH;
359 int i_padv = PADV << PARAM_INTERLACED;
361 pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
362 height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
363 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
364 h->fenc->i_lines_weighted += height;
366 for( int k = j; k < h->i_ref[0]; k++ )
367 if( h->sh.weight[k][0].weightfn )
369 pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
370 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
371 src + offset, frame->i_stride[0],
372 width, height, &h->sh.weight[k][0] );
379 /* initialize an array of lambda*nbits for all possible mvs */
380 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
382 a->p_cost_mv = h->cost_mv[a->i_qp];
383 a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
384 a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
387 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
389 int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
390 a->i_lambda = x264_lambda_tab[qp];
391 a->i_lambda2 = x264_lambda2_tab[qp];
393 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
394 if( h->param.analyse.i_trellis )
396 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
397 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
398 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
399 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
401 h->mb.i_psy_rd_lambda = a->i_lambda;
402 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
403 int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
404 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
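/* Worked example, assuming no chroma QP offset: at qp 38 the H.264 chroma QP
 * table gives chroma_qp 35, so chroma_offset_idx = 38-35+12 = 15 and the chroma
 * lambda2 is scaled by 512/256 = 2x when psy is enabled; with psy off the
 * scale stays at 256 (1x). */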
406 if( qp > QP_MAX_SPEC )
408 h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
409 h->nr_residual_sum = h->nr_residual_sum_buf[1];
410 h->nr_count = h->nr_count_buf[1];
411 h->mb.b_noise_reduction = 1;
412 qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
416 h->nr_offset = h->nr_offset_denoise;
417 h->nr_residual_sum = h->nr_residual_sum_buf[0];
418 h->nr_count = h->nr_count_buf[0];
419 h->mb.b_noise_reduction = 0;
422 a->i_qp = h->mb.i_qp = qp;
423 h->mb.i_chroma_qp = h->chroma_qp_table[qp];
426 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
428 int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
430 /* mbrd == 1 -> RD mode decision */
431 /* mbrd == 2 -> RD refinement */
432 /* mbrd == 3 -> QPRD */
433 a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
434 h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
435 a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;
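/* e.g. --subme 6-7 -> i_mbrd 1, 8-9 -> 2, 10+ -> 3 (QPRD); in B slices subme is
 * read one level lower, so --subme 8 gives RD mode decision but not RD refinement. */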
437 x264_mb_analyse_init_qp( h, a, qp );
439 h->mb.b_transform_8x8 = 0;
445 a->i_satd_chroma = COST_MAX;
447 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
448 * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
449 uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
450 a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;
453 a->b_avoid_topright = 0;
455 h->mb.b_lossless ? 0 :
457 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
459 /* II: Inter part P/B frame */
460 if( h->sh.i_type != SLICE_TYPE_I )
462 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
463 // limit motion search to a slightly smaller range than the theoretical limit,
464 // since the search may go a few iterations past its given range
465 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
467 /* Calculate max allowed MV range */
468 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
469 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
470 h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
471 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
472 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
473 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
475 int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
476 int max_mv = max_x - 4*16*h->mb.i_mb_x;
477 /* If we're left of the refresh bar, don't reference right of it. */
478 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
479 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
481 h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
482 h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
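/* Worked example, assuming --mv-range 512 (i_fmv_range = 2048 qpel): for
 * mb_x = 10 in an 80-mb-wide frame, mv_min[0] = 4*(-160-24) = -736 and
 * mv_max[0] = 4*(16*69+24) = 4512, clipped to [-736, 2047] in qpel, so the
 * fpel search window becomes [-736/4 + 6, 2047/4 - 6] = [-178, 505]. */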
483 if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
485 int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
486 int thread_mvy_range = i_fmv_range;
488 if( h->i_thread_frames > 1 )
490 int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
491 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
492 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
493 for( int j = 0; j < h->i_ref[i]; j++ )
495 x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
496 thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
499 if( h->param.b_deterministic )
500 thread_mvy_range = h->param.analyse.i_mv_range_thread;
501 if( PARAM_INTERLACED )
502 thread_mvy_range >>= 1;
504 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
507 if( PARAM_INTERLACED )
509 /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
510 for( int i = 0; i < 3; i++ )
513 mb_y = (h->mb.i_mb_y >> j) + (i == 1);
514 h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
515 h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
516 h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
517 h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
518 h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
519 h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
520 h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
525 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
526 h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
527 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
528 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
529 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
530 h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
531 h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
534 if( PARAM_INTERLACED )
536 int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
537 h->mb.mv_min[1] = h->mb.mv_miny_row[i];
538 h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
539 h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
540 h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
541 h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
542 h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
550 a->l0.i_cost8x16 = COST_MAX;
551 if( h->sh.i_type == SLICE_TYPE_B )
556 a->i_cost8x8direct[0] =
557 a->i_cost8x8direct[1] =
558 a->i_cost8x8direct[2] =
559 a->i_cost8x8direct[3] =
568 a->i_cost16x16direct =
571 a->i_cost8x16bi = COST_MAX;
573 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
574 for( int i = 0; i < 4; i++ )
578 a->l0.i_cost4x8[i] = COST_MAX;
581 /* Fast intra decision */
582 if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
584 /* Always run in fast-intra mode for subme < 3 */
585 if( h->mb.i_subpel_refine > 2 &&
586 ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
587 IS_INTRA( h->mb.i_mb_type_top ) ||
588 IS_INTRA( h->mb.i_mb_type_topleft ) ||
589 IS_INTRA( h->mb.i_mb_type_topright ) ||
590 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
591 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
592 { /* intra is likely */ }
599 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
600 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
602 a->b_force_intra = 1;
604 a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
607 a->b_force_intra = 0;
611 /* Prediction modes allowed for various combinations of neighbors. */
612 /* Terminated by a -1. */
613 /* In order, no neighbors, left, top, top/left, top/left/topleft */
614 static const int8_t i16x16_mode_available[5][5] =
616 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
617 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
618 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
619 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
620 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
623 static const int8_t chroma_mode_available[5][5] =
625 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
626 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
627 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
628 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
629 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
632 static const int8_t i8x8_mode_available[2][5][10] =
635 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
636 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
637 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
638 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
639 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
642 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
643 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
644 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
645 {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
646 {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
650 static const int8_t i4x4_mode_available[2][5][10] =
653 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
654 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
655 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
656 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
657 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
660 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
661 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
662 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
663 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
664 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
668 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
670 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
671 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
672 return i16x16_mode_available[idx];
675 static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
677 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
678 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
679 return chroma_mode_available[idx];
682 static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
684 int avoid_topright = force_intra && (i&1);
685 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
686 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
687 return i8x8_mode_available[avoid_topright][idx];
690 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
692 int avoid_topright = force_intra && ((i&5) == 5);
693 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
694 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
695 return i4x4_mode_available[avoid_topright][idx];
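/* The row index works because MB_LEFT and MB_TOP are the two low neighbour
 * bits: row 0 = no neighbours (DC_128 only), 1 = left, 2 = top, 3 = top+left
 * without topleft, 4 = all three, the only case that allows the modes needing
 * the top-left sample (plane for 16x16/chroma, DDR/VR/HD for 4x4/8x8). */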
698 /* For trellis=2, we need to do this for both sizes of DCT; for trellis=1 we only need to use it on the chosen mode. */
699 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
701 ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
703 if( do_both_dct || h->mb.b_transform_8x8 )
704 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
705 if( do_both_dct || !h->mb.b_transform_8x8 )
706 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
709 /* Reset fenc satd scores cache for psy RD */
710 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
712 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
713 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
714 if( !h->mb.i_psy_rd )
716 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
717 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
719 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
722 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
724 if( a->i_satd_chroma < COST_MAX )
729 if( !h->mb.b_chroma_me )
731 a->i_satd_chroma = 0;
735 /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
736 if( h->mb.b_lossless )
738 x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
739 x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
743 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
744 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
746 a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
747 + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
751 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
752 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
754 /* Prediction selection for chroma */
755 if( predict_mode[3] >= 0 && !h->mb.b_lossless )
757 int satdu[4], satdv[4];
758 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
759 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
760 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
761 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
762 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
763 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
765 for( ; *predict_mode >= 0; predict_mode++ )
767 int i_mode = *predict_mode;
768 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
770 a->i_satd_chroma_dir[i_mode] = i_satd;
771 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
776 for( ; *predict_mode >= 0; predict_mode++ )
779 int i_mode = *predict_mode;
781 /* we do the prediction */
782 if( h->mb.b_lossless )
783 x264_predict_lossless_chroma( h, i_mode );
786 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
787 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
790 /* we calculate the cost */
791 i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
792 h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
793 a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
795 a->i_satd_chroma_dir[i_mode] = i_satd;
796 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
800 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
803 /* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
804 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
806 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
807 pixel *p_src = h->mb.pic.p_fenc[0];
808 pixel *p_dst = h->mb.pic.p_fdec[0];
809 static const int8_t intra_analysis_shortcut[2][2][2][5] =
811 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
812 {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
813 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
814 {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
815 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
816 {-1, -1, -1, -1, -1}},
817 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
818 {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
822 int lambda = a->i_lambda;
824 /*---------------- Try all mode and calculate their score ---------------*/
825 /* Disabled i16x16 for AVC-Intra compat */
826 if( !h->param.i_avcintra_class )
828 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
830 /* Not heavily tuned */
831 static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
832 int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
834 if( !h->mb.b_lossless && predict_mode[3] >= 0 )
836 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
837 a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
838 a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
839 a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
840 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
841 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
842 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
844 /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
845 if( a->i_satd_i16x16 <= i16x16_thresh )
847 h->predict_16x16[I_PRED_16x16_P]( p_dst );
848 a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
849 a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
850 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
855 for( ; *predict_mode >= 0; predict_mode++ )
858 int i_mode = *predict_mode;
860 if( h->mb.b_lossless )
861 x264_predict_lossless_16x16( h, 0, i_mode );
863 h->predict_16x16[i_mode]( p_dst );
865 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
866 lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
867 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
868 a->i_satd_i16x16_dir[i_mode] = i_satd;
872 if( h->sh.i_type == SLICE_TYPE_B )
873 /* cavlc mb type prefix */
874 a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
876 if( a->i_satd_i16x16 > i16x16_thresh )
880 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
881 /* 8x8 prediction selection */
882 if( flags & X264_ANALYSE_I8x8 )
884 ALIGNED_ARRAY_32( pixel, edge,[36] );
885 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
886 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
888 // FIXME some bias like in i4x4?
889 int i_cost = lambda * 4; /* base predmode costs */
890 h->mb.i_cbp_luma = 0;
892 if( h->sh.i_type == SLICE_TYPE_B )
893 i_cost += lambda * i_mb_b_cost_table[I_8x8];
895 for( idx = 0;; idx++ )
899 pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
900 pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
901 int i_best = COST_MAX;
902 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
904 const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
905 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
907 if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
909 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
910 i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
911 i_cost += i_best & 0xffff;
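/* intra_mbcmp_x9_8x8 returns a packed value: the low 16 bits hold the best
 * mode's cost (added to i_cost here) and the bits above hold the winning mode
 * index, which is shifted down before being stored as the prediction. */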
913 a->i_predict8x8[idx] = i_best;
914 if( idx == 3 || i_cost > i_satd_thresh )
916 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
920 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
922 ALIGNED_ARRAY_16( int32_t, satd,[9] );
923 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
924 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
925 satd[i_pred_mode] -= 3 * lambda;
926 for( int i = 2; i >= 0; i-- )
929 a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
930 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
933 /* Take analysis shortcuts: don't analyse modes that are too
934 * far away direction-wise from the favored mode. */
935 if( a->i_mbrd < 1 + a->b_fast_intra )
936 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
941 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
944 int i_mode = *predict_mode;
946 if( h->mb.b_lossless )
947 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
949 h->predict_8x8[i_mode]( p_dst_by, edge );
951 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
952 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
953 i_satd -= 3 * lambda;
955 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
956 a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
958 i_cost += i_best + 3*lambda;
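/* CAVLC signals each intra 4x4/8x8 mode in 1 bit when it equals the predicted
 * mode and 4 bits otherwise: the 1-bit flag per block is pre-charged in the
 * initial i_cost above, the 3*lambda here covers the remaining three bits, and
 * the predicted mode already earned a 3*lambda discount when it was scored. */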
960 if( idx == 3 || i_cost > i_satd_thresh )
962 if( h->mb.b_lossless )
963 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
965 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
966 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
968 /* we need to encode this block now (for next ones) */
969 x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
974 a->i_satd_i8x8 = i_cost;
975 if( h->mb.i_skip_intra )
977 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
978 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
979 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
980 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
981 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
982 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
983 if( h->mb.i_skip_intra == 2 )
984 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
989 static const uint16_t cost_div_fix8[3] = {1024,512,341};
990 a->i_satd_i8x8 = COST_MAX;
991 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
993 /* Not heavily tuned */
994 static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
995 if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
999 /* 4x4 prediction selection */
1000 if( flags & X264_ANALYSE_I4x4 )
1002 int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
1003 int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
1004 h->mb.i_cbp_luma = 0;
1006 if( a->b_early_terminate && a->i_mbrd )
1007 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
1009 if( h->sh.i_type == SLICE_TYPE_B )
1010 i_cost += lambda * i_mb_b_cost_table[I_4x4];
1012 for( idx = 0;; idx++ )
1014 pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
1015 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1016 int i_best = COST_MAX;
1017 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
1019 const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1021 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1022 /* emulate missing topright samples */
1023 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
1025 if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
1027 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
1028 i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
1029 i_cost += i_best & 0xffff;
1031 a->i_predict4x4[idx] = i_best;
1032 if( i_cost > i_satd_thresh || idx == 15 )
1034 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
1038 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
1040 ALIGNED_ARRAY_16( int32_t, satd,[9] );
1041 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
1042 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
1043 satd[i_pred_mode] -= 3 * lambda;
1044 i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
1045 COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
1046 COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
1048 /* Take analysis shortcuts: don't analyse modes that are too
1049 * far away direction-wise from the favored mode. */
1050 if( a->i_mbrd < 1 + a->b_fast_intra )
1051 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
1058 for( ; *predict_mode >= 0; predict_mode++ )
1061 int i_mode = *predict_mode;
1063 if( h->mb.b_lossless )
1064 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
1066 h->predict_4x4[i_mode]( p_dst_by );
1068 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
1069 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
1071 i_satd -= lambda * 3;
1075 a->i_predict4x4[idx] = i_mode;
1080 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
1084 i_cost += i_best + 3 * lambda;
1085 if( i_cost > i_satd_thresh || idx == 15 )
1087 if( h->mb.b_lossless )
1088 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
1090 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
1091 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1093 /* we need to encode this block now (for next ones) */
1094 x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
1098 a->i_satd_i4x4 = i_cost;
1099 if( h->mb.i_skip_intra )
1101 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
1102 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
1103 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1104 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
1105 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
1106 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
1107 if( h->mb.i_skip_intra == 2 )
1108 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
1112 a->i_satd_i4x4 = COST_MAX;
1116 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1118 if( !a->b_early_terminate )
1119 i_satd_thresh = COST_MAX;
1121 if( a->i_satd_i16x16 < i_satd_thresh )
1123 h->mb.i_type = I_16x16;
1124 x264_analyse_update_cache( h, a );
1125 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1128 a->i_satd_i16x16 = COST_MAX;
1130 if( a->i_satd_i4x4 < i_satd_thresh )
1132 h->mb.i_type = I_4x4;
1133 x264_analyse_update_cache( h, a );
1134 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
1137 a->i_satd_i4x4 = COST_MAX;
1139 if( a->i_satd_i8x8 < i_satd_thresh )
1141 h->mb.i_type = I_8x8;
1142 x264_analyse_update_cache( h, a );
1143 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1144 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1147 a->i_satd_i8x8 = COST_MAX;
1150 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1152 uint64_t i_satd, i_best;
1153 int plane_count = CHROMA444 ? 3 : 1;
1154 h->mb.i_skip_intra = 0;
1156 if( h->mb.i_type == I_16x16 )
1158 int old_pred_mode = a->i_predict16x16;
1159 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
1160 int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
1161 i_best = a->i_satd_i16x16;
1162 for( ; *predict_mode >= 0; predict_mode++ )
1164 int i_mode = *predict_mode;
1165 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1167 h->mb.i_intra16x16_pred_mode = i_mode;
1168 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1169 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1173 /* RD selection for chroma prediction */
1176 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
1177 if( predict_mode[1] >= 0 )
1179 int8_t predict_mode_sorted[4];
1181 int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
1183 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
1185 int i_mode = *predict_mode;
1186 if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
1187 predict_mode_sorted[i_max++] = i_mode;
1192 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1193 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1194 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1195 * coefs for the current chroma mode are still around, so we only
1196 * have to recount the bits. */
1197 i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1198 for( int i = 0; i < i_max; i++ )
1200 int i_mode = predict_mode_sorted[i];
1201 if( h->mb.b_lossless )
1202 x264_predict_lossless_chroma( h, i_mode );
1205 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
1206 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
1208 /* if we've already found a mode that needs no residual, then
1209 * probably any mode with a residual will be worse.
1210 * so avoid dct on the remaining modes to improve speed. */
1211 i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1212 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1214 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1215 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1220 if( h->mb.i_type == I_4x4 )
1222 pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
1224 for( int idx = 0; idx < 16; idx++ )
1226 pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
1227 h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
1228 h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
1229 i_best = COST_MAX64;
1231 const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1233 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1234 for( int p = 0; p < plane_count; p++ )
1235 /* emulate missing topright samples */
1236 MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );
1238 for( ; *predict_mode >= 0; predict_mode++ )
1240 int i_mode = *predict_mode;
1241 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1243 if( i_best > i_satd )
1245 a->i_predict4x4[idx] = i_mode;
1247 for( int p = 0; p < plane_count; p++ )
1249 pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
1250 pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
1251 pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
1252 pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
1253 nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
1258 for( int p = 0; p < plane_count; p++ )
1260 MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
1261 MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
1262 MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
1263 MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
1264 h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
1267 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1270 else if( h->mb.i_type == I_8x8 )
1272 ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
1273 pixel4 pels_h[3][2] = {{0}};
1274 pixel pels_v[3][7] = {{0}};
1275 uint16_t nnz[3][2] = {{0}}; //shut up gcc
1276 for( int idx = 0; idx < 4; idx++ )
1280 int s8 = X264_SCAN8_0 + 2*x + 16*y;
1281 pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
1282 h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
1283 h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
1284 int cbp_luma_new = 0;
1285 int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;
1287 i_best = COST_MAX64;
1289 const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
1290 for( int p = 0; p < plane_count; p++ )
1291 h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1293 for( ; *predict_mode >= 0; predict_mode++ )
1295 int i_mode = *predict_mode;
1296 if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
1299 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1300 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );
1302 if( i_best > i_satd )
1304 a->i_predict8x8[idx] = i_mode;
1305 cbp_luma_new = h->mb.i_cbp_luma;
1308 for( int p = 0; p < plane_count; p++ )
1310 pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
1311 pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
1313 for( int j = 0; j < 7; j++ )
1314 pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
1315 nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
1316 nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
1320 a->i_cbp_i8x8_luma = cbp_luma_new;
1321 for( int p = 0; p < plane_count; p++ )
1323 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
1324 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
1326 for( int j = 0; j < 7; j++ )
1327 dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
1328 M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
1329 M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
1332 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1337 #define LOAD_FENC(m, src, xoff, yoff) \
1339 (m)->p_cost_mv = a->p_cost_mv; \
1340 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1341 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1342 (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
1343 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1344 (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1345 (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1348 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1350 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1351 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1352 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1353 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1356 (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
1357 (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
1358 (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
1359 (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
1360 (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
1361 (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
1362 (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
1363 (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
1366 (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
1367 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1368 (m)->weight = x264_weight_none; \
1372 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1373 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1374 (m)->weight = h->sh.weight[i_ref];
1376 #define REF_COST(list, ref) \
1377 (a->p_cost_ref[list][ref])
1379 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1383 ALIGNED_4( int16_t mvc[8][2] );
1384 int i_halfpel_thresh = INT_MAX;
1385 int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
1387 /* 16x16 Search on all ref frame */
1388 m.i_pixel = PIXEL_16x16;
1389 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1391 a->l0.me16x16.cost = INT_MAX;
1392 for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1394 m.i_ref_cost = REF_COST( 0, i_ref );
1395 i_halfpel_thresh -= m.i_ref_cost;
1397 /* search with ref */
1398 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1399 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1401 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1403 if( h->mb.ref_blind_dupe == i_ref )
1405 CP32( m.mv, a->l0.mvc[0][0] );
1406 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1410 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1411 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1414 /* save mv for predicting neighbors */
1415 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1416 CP32( a->l0.mvc[i_ref][0], m.mv );
1418 /* early termination
1419 * SSD threshold would probably be better than SATD */
1422 && m.cost-m.cost_mv < 300*a->i_lambda
1423 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1424 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1425 && x264_macroblock_probe_pskip( h ) )
1427 h->mb.i_type = P_SKIP;
1428 x264_analyse_update_cache( h, a );
1429 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1433 m.cost += m.i_ref_cost;
1434 i_halfpel_thresh += m.i_ref_cost;
1436 if( m.cost < a->l0.me16x16.cost )
1437 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1440 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1441 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1443 h->mb.i_type = P_L0;
1446 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1447 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1449 h->mb.i_partition = D_16x16;
1450 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1451 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1452 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1453 h->mb.i_type = P_SKIP;
1458 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1461 pixel **p_fenc = h->mb.pic.p_fenc;
1462 int i_maxref = h->mb.pic.i_fref[0]-1;
1464 h->mb.i_partition = D_8x8;
1466 #define CHECK_NEIGHBOUR(i)\
1468 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1469 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1473 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1474 * than those used by the neighbors */
1475 if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1476 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
1479 CHECK_NEIGHBOUR( -8 - 1 );
1480 CHECK_NEIGHBOUR( -8 + 0 );
1481 CHECK_NEIGHBOUR( -8 + 2 );
1482 CHECK_NEIGHBOUR( -8 + 4 );
1483 CHECK_NEIGHBOUR( 0 - 1 );
1484 CHECK_NEIGHBOUR( 2*8 - 1 );
1486 #undef CHECK_NEIGHBOUR
1488 for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1489 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1491 for( int i = 0; i < 4; i++ )
1493 x264_me_t *l0m = &a->l0.me8x8[i];
1497 m.i_pixel = PIXEL_8x8;
1499 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1500 l0m->cost = INT_MAX;
1501 for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1503 m.i_ref_cost = REF_COST( 0, i_ref );
1505 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1506 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1508 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1509 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1510 if( h->mb.ref_blind_dupe == i_ref )
1512 CP32( m.mv, a->l0.mvc[0][i+1] );
1513 x264_me_refine_qpel_refdupe( h, &m, NULL );
1516 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1518 m.cost += m.i_ref_cost;
1520 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1522 if( m.cost < l0m->cost )
1523 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1524 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1525 i_ref = h->mb.ref_blind_dupe;
1529 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1530 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1532 a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1534 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1535 are effectively zero. */
1536 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1537 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1540 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1541 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1542 /* P_8x8 ref0 has no ref cost */
1543 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1544 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1545 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1546 M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
1549 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1551 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1552 * reference frame flags. Thus, if we're not doing mixedrefs, just
1553 * don't bother analysing the dupes. */
1554 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1555 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1556 pixel **p_fenc = h->mb.pic.p_fenc;
1558 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1560 /* XXX Needed for x264_mb_predict_mv */
1561 h->mb.i_partition = D_8x8;
1564 CP32( mvc[0], a->l0.me16x16.mv );
1566 for( int i = 0; i < 4; i++ )
1568 x264_me_t *m = &a->l0.me8x8[i];
1572 m->i_pixel = PIXEL_8x8;
1573 m->i_ref_cost = i_ref_cost;
1575 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1576 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1577 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1579 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1580 x264_me_search( h, m, mvc, i_mvc );
1582 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1584 CP32( mvc[i_mvc], m->mv );
1587 a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1590 m->cost += i_ref_cost;
1591 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1592 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1595 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1596 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1597 /* theoretically this should include 4*ref_cost,
1598 * but 3 seems a better approximation of cabac. */
1599 if( h->param.b_cabac )
1600 a->l0.i_cost8x8 -= i_ref_cost;
1601 M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
1604 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1607 pixel **p_fenc = h->mb.pic.p_fenc;
1608 ALIGNED_4( int16_t mvc[3][2] );
1610 /* XXX Needed for x264_mb_predict_mv */
1611 h->mb.i_partition = D_16x8;
1613 for( int i = 0; i < 2; i++ )
1615 x264_me_t *l0m = &a->l0.me16x8[i];
1616 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1617 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1618 const int ref8[2] = { minref, maxref };
1619 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1621 m.i_pixel = PIXEL_16x8;
1623 LOAD_FENC( &m, p_fenc, 0, 8*i );
1624 l0m->cost = INT_MAX;
1625 for( int j = 0; j < i_ref8s; j++ )
1627 const int i_ref = ref8[j];
1628 m.i_ref_cost = REF_COST( 0, i_ref );
1630 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1631 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1632 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1633 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1635 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1636 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1638 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1639 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1640 /* We can only take this shortcut if the first search was performed on ref0. */
1641 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1643 /* We can just leave the MV from the previous ref search. */
1644 x264_me_refine_qpel_refdupe( h, &m, NULL );
1647 x264_me_search( h, &m, mvc, 3 );
1649 m.cost += m.i_ref_cost;
1651 if( m.cost < l0m->cost )
1652 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1655 /* Early termination based on the current SATD score of partition[0]
1656 plus the estimated SATD score of partition[1] */
1657 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1659 a->l0.i_cost16x8 = COST_MAX;
1663 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1664 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1667 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1670 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1673 pixel **p_fenc = h->mb.pic.p_fenc;
1674 ALIGNED_4( int16_t mvc[3][2] );
1676 /* XXX Needed for x264_mb_predict_mv */
1677 h->mb.i_partition = D_8x16;
1679 for( int i = 0; i < 2; i++ )
1681 x264_me_t *l0m = &a->l0.me8x16[i];
1682 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1683 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1684 const int ref8[2] = { minref, maxref };
1685 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1687 m.i_pixel = PIXEL_8x16;
1689 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1690 l0m->cost = INT_MAX;
1691 for( int j = 0; j < i_ref8s; j++ )
1693 const int i_ref = ref8[j];
1694 m.i_ref_cost = REF_COST( 0, i_ref );
1696 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1697 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1698 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1700 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1701 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1703 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1704 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1705 /* We can only take this shortcut if the first search was performed on ref0. */
1706 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1708 /* We can just leave the MV from the previous ref search. */
1709 x264_me_refine_qpel_refdupe( h, &m, NULL );
1712 x264_me_search( h, &m, mvc, 3 );
1714 m.cost += m.i_ref_cost;
1716 if( m.cost < l0m->cost )
1717 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1720 /* Early termination based on the current SATD score of partition[0]
1721 plus the estimated SATD score of partition[1] */
1722 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1724 a->l0.i_cost8x16 = COST_MAX;
1728 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1729 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1732 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
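/* The early-termination test in the two loops above compares partition[0]'s
 * measured cost plus the precomputed estimate for partition[1] against the
 * best whole-MB score so far, with 25% of slack when RD refinement will run
 * (i_mbrd != 0), since RD can still rescue a slightly worse SATD candidate.
 * Worked example with made-up numbers: i_best_satd = 1200 and mbrd on gives
 * a threshold of 1200 * (4 + 1) / 4 = 1500, so a first half costing 900 with
 * an estimated second half of 700 (total 1600) aborts the search.  In
 * isolation (illustrative): */
#if 0
static int early_terminate_half( int cost_part0, int est_part1, int i_best_satd, int mbrd )
{
    return cost_part0 + est_part1 > i_best_satd * (4 + !!mbrd) / 4;
}
#endif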
1735 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
1736 pixel **p_fref, int i8x8, int size, int chroma )
1738 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
1739 pixel *pix2 = pix1+8;
1740 int i_stride = h->mb.pic.i_stride[1];
1741 int chroma_h_shift = chroma <= CHROMA_422;
1742 int chroma_v_shift = chroma == CHROMA_420;
1743 int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
1744 int i_ref = a->l0.me8x8[i8x8].i_ref;
1745 int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1746 x264_weight_t *weight = h->sh.weight[i_ref];
1748 // FIXME weight can be done on 4x4 blocks even if mc is smaller
1749 #define CHROMA4x4MC( width, height, me, x, y ) \
1750 if( chroma == CHROMA_444 ) \
1752 int mvx = (me).mv[0] + 4*2*x; \
1753 int mvy = (me).mv[1] + 4*2*y; \
1754 h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
1755 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
1756 h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
1757 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
1761 int offset = x + (2>>chroma_v_shift)*16*y; \
1762 int chroma_height = (2>>chroma_v_shift)*height; \
1763 h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
1764 (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
1765 if( weight[1].weightfn ) \
1766 weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
1767 if( weight[2].weightfn ) \
1768 weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
1771 if( size == PIXEL_4x4 )
1773 x264_me_t *m = a->l0.me4x4[i8x8];
1774 CHROMA4x4MC( 2,2, m[0], 0,0 );
1775 CHROMA4x4MC( 2,2, m[1], 2,0 );
1776 CHROMA4x4MC( 2,2, m[2], 0,2 );
1777 CHROMA4x4MC( 2,2, m[3], 2,2 );
1779 else if( size == PIXEL_8x4 )
1781 x264_me_t *m = a->l0.me8x4[i8x8];
1782 CHROMA4x4MC( 4,2, m[0], 0,0 );
1783 CHROMA4x4MC( 4,2, m[1], 0,2 );
1787 x264_me_t *m = a->l0.me4x8[i8x8];
1788 CHROMA4x4MC( 2,4, m[0], 0,0 );
1789 CHROMA4x4MC( 2,4, m[1], 2,0 );
1793 int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
1794 int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
1795 return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1796 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
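/* The offset and block-size arithmetic above follows from the two
 * subsampling shifts computed at the top of the function:
 *   4:2:0 -> chroma_h_shift=1, chroma_v_shift=1 (half width, half height)
 *   4:2:2 -> chroma_h_shift=1, chroma_v_shift=0 (half width, full height)
 *   4:4:4 -> chroma_h_shift=0, chroma_v_shift=0 (full resolution; the macro
 *            uses the mc_luma path instead of mc_chroma)
 * Spelled out as a tiny helper (illustrative only): */
#if 0
static void chroma_shifts( int chroma, int *h_shift, int *v_shift )
{
    *h_shift = chroma <= CHROMA_422; /* width halved for 4:2:0 and 4:2:2 */
    *v_shift = chroma == CHROMA_420; /* height halved only for 4:2:0 */
}
#endif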
1799 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1801 if( CHROMA_FORMAT == CHROMA_444 )
1802 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
1803 else if( CHROMA_FORMAT == CHROMA_422 )
1804 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
1806 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
1809 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1811 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1812 pixel **p_fenc = h->mb.pic.p_fenc;
1813 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1815 /* XXX Needed for x264_mb_predict_mv */
1816 h->mb.i_partition = D_8x8;
1818 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1820 const int idx = 4*i8x8 + i4x4;
1821 const int x4 = block_idx_x[idx];
1822 const int y4 = block_idx_y[idx];
1823 const int i_mvc = (i4x4 == 0);
1825 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1827 m->i_pixel = PIXEL_4x4;
1829 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1830 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1831 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1833 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1834 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1836 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1838 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1839 a->l0.me4x4[i8x8][1].cost +
1840 a->l0.me4x4[i8x8][2].cost +
1841 a->l0.me4x4[i8x8][3].cost +
1842 REF_COST( 0, i_ref ) +
1843 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1844 if( h->mb.b_chroma_me )
1845 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1848 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1850 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1851 pixel **p_fenc = h->mb.pic.p_fenc;
1852 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1854 /* XXX Needed for x264_mb_predict_mv */
1855 h->mb.i_partition = D_8x8;
1857 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1859 const int idx = 4*i8x8 + 2*i8x4;
1860 const int x4 = block_idx_x[idx];
1861 const int y4 = block_idx_y[idx];
1862 const int i_mvc = (i8x4 == 0);
1864 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1866 m->i_pixel = PIXEL_8x4;
1868 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1869 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1870 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1872 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1873 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1875 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1877 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1878 REF_COST( 0, i_ref ) +
1879 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1880 if( h->mb.b_chroma_me )
1881 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1884 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1886 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1887 pixel **p_fenc = h->mb.pic.p_fenc;
1888 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1890 /* XXX Needed for x264_mb_predict_mv */
1891 h->mb.i_partition = D_8x8;
1893 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1895 const int idx = 4*i8x8 + i4x8;
1896 const int x4 = block_idx_x[idx];
1897 const int y4 = block_idx_y[idx];
1898 const int i_mvc = (i4x8 == 0);
1900 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1902 m->i_pixel = PIXEL_4x8;
1904 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1905 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1906 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1908 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1909 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1911 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1913 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1914 REF_COST( 0, i_ref ) +
1915 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1916 if( h->mb.b_chroma_me )
1917 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1920 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
1922 ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
1923 ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
1924 int i_chroma_cost = 0;
1925 int chromapix = h->luma2chroma_pixel[i_pixel];
1927 #define COST_BI_CHROMA( m0, m1, width, height ) \
1931 h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
1932 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1933 h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
1934 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1935 h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
1936 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1937 h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
1938 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1942 int v_shift = CHROMA_V_SHIFT; \
1943 int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1944 int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1945 h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
1946 m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1947 h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
1948 m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1950 h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1951 h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1952 i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
1953 + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
1956 if( i_pixel == PIXEL_16x16 )
1957 COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
1958 else if( i_pixel == PIXEL_16x8 )
1959 COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
1960 else if( i_pixel == PIXEL_8x16 )
1961 COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
1963 COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
1965 return i_chroma_cost;
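/* For every bi-predicted candidate, chroma from both lists is motion
 * compensated, blended with the reference pair's bipred weight, and scored
 * against the encoded chroma with the same mbcmp metric used for luma; the
 * result is simply added to the partition's bi cost.  A rough sketch of the
 * weighted-average-then-compare shape, with plain buffers standing in for
 * the MC output and SAD standing in for mbcmp (all names are placeholders,
 * and the weights are assumed to sum to 64): */
#if 0
static int bi_chroma_cost_sketch( const pixel *fenc, const pixel *l0, const pixel *l1,
                                  int n, int w0, int w1 )
{
    int cost = 0;
    for( int i = 0; i < n; i++ )
    {
        int avg = ( l0[i]*w0 + l1[i]*w1 + 32 ) >> 6; /* roughly what the weighted h->mc.avg computes */
        int d = fenc[i] - avg;
        cost += d < 0 ? -d : d;                      /* SAD stands in for the mbcmp metric */
    }
    return cost;
}
#endif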
1968 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1970 /* Assumes that fdec still contains the results of
1971 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1973 pixel *p_fenc = h->mb.pic.p_fenc[0];
1974 pixel *p_fdec = h->mb.pic.p_fdec[0];
1976 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1977 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1979 int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
1981 for( int i = 0; i < 4; i++ )
1983 const int x = (i&1)*8;
1984 const int y = (i>>1)*8;
1985 a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
1986 &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1987 if( h->mb.b_chroma_me )
1989 int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
1990 int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
1991 a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
1992 &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
1993 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
1994 &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
1996 a->i_cost16x16direct += a->i_cost8x8direct[i];
1999 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
2004 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
2005 if( h->mb.b_chroma_me )
2007 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2008 a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
2009 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
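/* Because direct/skip MC output is still sitting in fdec, the direct cost is
 * measured straight against the reconstruction instead of redoing motion
 * compensation, and with BSUB16x16 it is also recorded per 8x8 block so each
 * quadrant can later compete as a D_DIRECT_8x8 sub-partition.  The quadrant
 * offsets used above, in isolation (illustrative helpers): */
#if 0
static int luma_offset_8x8( int i, int stride )
{
    return (i&1)*8 + (i>>1)*8*stride;                           /* x + y*stride with x,y in {0,8} */
}
static int chroma_offset_8x8( int i, int stride, int h_shift, int v_shift )
{
    return ((i&1)*8 >> h_shift) + ((i>>1)*8 >> v_shift)*stride; /* same point in the subsampled plane */
}
#endif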
2014 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
2016 ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
2017 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
2019 intptr_t stride0 = 16, stride1 = 16;
2021 ALIGNED_4( int16_t mvc[9][2] );
2022 int try_skip = a->b_try_skip;
2023 int list1_skipped = 0;
2024 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
2025 int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
2026 (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};
2029 m.i_pixel = PIXEL_16x16;
2031 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
2033 /* 16x16 Search on list 0 and list 1 */
2034 a->l0.me16x16.cost = INT_MAX;
2035 a->l1.me16x16.cost = INT_MAX;
2036 for( int l = 1; l >= 0; )
2038 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2040 /* This loop is extremely munged in order to facilitate the following order of operations,
2041 * necessary for an efficient fast skip.
2042 * 1. Search list1 ref0.
2043 * 2. Search list0 ref0.
2044 * 3. Try skip.
2045 * 4. Search the rest of list0.
2046 * 5. Go back and finish list1.
2048 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
2050 if( try_skip && l == 1 && i_ref > 0 )
2056 m.i_ref_cost = REF_COST( l, i_ref );
2058 /* search with ref */
2059 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
2060 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
2061 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
2062 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
2065 m.cost += m.i_ref_cost;
2067 if( m.cost < lX->me16x16.cost )
2068 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
2070 /* save mv for predicting neighbors */
2071 CP32( lX->mvc[i_ref][0], m.mv );
2072 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
2074 /* Fast skip detection. */
2075 if( i_ref == 0 && try_skip )
2077 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
2078 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
2084 /* We already tested skip */
2085 h->mb.i_type = B_SKIP;
2086 x264_analyse_update_cache( h, a );
2091 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
2093 if( list1_skipped && l == 0 )
2099 /* get cost of BI mode */
2100 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
2101 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
2102 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
2103 src0 = h->mc.get_ref( pix0, &stride0,
2104 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
2105 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
2106 src1 = h->mc.get_ref( pix1, &stride1,
2107 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
2108 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );
2110 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2112 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2114 + a->l0.bi16x16.cost_mv
2115 + a->l1.bi16x16.cost_mv;
2117 if( h->mb.b_chroma_me )
2118 a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
2120 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
2121 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
2123 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
2124 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
2125 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
2126 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
2127 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2128 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2129 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2130 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2131 + ref_costs + l0_mv_cost + l1_mv_cost;
2133 if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
2135 ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
2139 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2140 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2141 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2142 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
2143 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2144 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2145 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2146 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
2150 ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
2151 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2152 int v_shift = CHROMA_V_SHIFT;
2154 if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
2156 int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2157 h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2158 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
2161 h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2162 h->mb.pic.i_stride[1], 16>>v_shift );
2164 if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
2166 int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2167 h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2168 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
2171 h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2172 h->mb.pic.i_stride[1], 16>>v_shift );
2174 h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
2175 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2176 h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
2177 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2179 cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
2180 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
2184 if( cost00 < a->i_cost16x16bi )
2186 M32( a->l0.bi16x16.mv ) = 0;
2187 M32( a->l1.bi16x16.mv ) = 0;
2188 a->l0.bi16x16.cost_mv = l0_mv_cost;
2189 a->l1.bi16x16.cost_mv = l1_mv_cost;
2190 a->i_cost16x16bi = cost00;
2195 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
2196 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
2197 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
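/* The explicit (0,0)/(0,0) candidate above guards against fades, where the
 * motion search tends to chase brightness gradients that then code poorly.
 * Its MV rate is just the cost of signalling the negated predictors, because
 * the coded residual is mv - mvp and mv is zero; p_cost_mv is offset into
 * the middle of the cost table, so the negative index is valid.  In
 * isolation (illustrative): */
#if 0
static int zero_mv_cost( const uint16_t *p_cost_mv, const int16_t mvp[2] )
{
    return p_cost_mv[-mvp[0]] + p_cost_mv[-mvp[1]];
}
#endif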
2200 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
2205 switch( h->mb.i_sub_partition[i] )
2208 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
2211 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
2212 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
2215 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
2216 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
2219 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
2220 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
2221 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
2222 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
2225 x264_log( h, X264_LOG_ERROR, "internal error\n" );
2230 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
2234 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
2235 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
2236 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
2237 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
2240 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
2241 if( x264_mb_partition_listX_table[0][part] ) \
2243 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
2244 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
2248 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
2249 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
2251 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
2253 if( x264_mb_partition_listX_table[1][part] ) \
2255 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
2256 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
2260 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
2261 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
2263 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
2266 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2270 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
2272 x264_mb_load_mv_direct8x8( h, i );
2275 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
2276 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
2277 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
2282 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
2285 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2287 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
2289 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2291 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
2295 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
2297 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2298 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
2300 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
2301 * than those used by the neighbors */
2302 #define CHECK_NEIGHBOUR(i)\
2304 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
2305 if( ref > i_maxref[l] )\
2309 for( int l = 0; l < 2; l++ )
2311 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2312 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
2313 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
2316 CHECK_NEIGHBOUR( -8 - 1 );
2317 CHECK_NEIGHBOUR( -8 + 0 );
2318 CHECK_NEIGHBOUR( -8 + 2 );
2319 CHECK_NEIGHBOUR( -8 + 4 );
2320 CHECK_NEIGHBOUR( 0 - 1 );
2321 CHECK_NEIGHBOUR( 2*8 - 1 );
2325 /* XXX Needed for x264_mb_predict_mv */
2326 h->mb.i_partition = D_8x8;
2330 for( int i = 0; i < 4; i++ )
2336 intptr_t stride[2] = {8,8};
2339 m.i_pixel = PIXEL_8x8;
2340 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2342 for( int l = 0; l < 2; l++ )
2344 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2346 lX->me8x8[i].cost = INT_MAX;
2347 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
2349 m.i_ref_cost = REF_COST( l, i_ref );
2351 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
2353 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
2354 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2355 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
2356 m.cost += m.i_ref_cost;
2358 if( m.cost < lX->me8x8[i].cost )
2360 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
2361 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
2364 /* save mv for predicting other partitions within this MB */
2365 CP32( lX->mvc[i_ref][i+1], m.mv );
2370 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
2371 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
2372 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
2373 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
2374 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
2375 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
2377 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2378 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
2379 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
2380 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2382 if( h->mb.b_chroma_me )
2384 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2385 i_part_cost_bi += i_chroma_cost;
2386 a->i_satd8x8[2][i] += i_chroma_cost;
2389 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2390 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2392 i_part_cost = a->l0.me8x8[i].cost;
2393 h->mb.i_sub_partition[i] = D_L0_8x8;
2394 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2395 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2396 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2397 a->i_cost8x8bi += i_part_cost;
2399 /* XXX Needed for x264_mb_predict_mv */
2400 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2404 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
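/* The CHECK_NEIGHBOUR scan above bounds the per-8x8 reference search: when
 * the 16x16 search already settled on ref 0, only references up to the
 * largest index actually used by the top/left neighbours are tried, on the
 * theory that a reference no neighbour uses is unlikely to win at 8x8.  The
 * bound in isolation (illustrative helper): */
#if 0
static int neighbour_ref_bound( const int8_t *neigh_refs, int n, int i_maxref )
{
    int bound = 0;
    for( int i = 0; i < n; i++ )
        if( neigh_refs[i] > bound )
            bound = neigh_refs[i];   /* intra/unavailable neighbours are negative and ignored */
    return X264_MIN( bound, i_maxref );
}
#endif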
2407 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2410 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2411 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2412 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2414 /* XXX Needed for x264_mb_predict_mv */
2415 h->mb.i_partition = D_8x8;
2419 for( int i = 0; i < 4; i++ )
2424 int i_part_cost_bi = 0;
2425 intptr_t stride[2] = {8,8};
2428 for( int l = 0; l < 2; l++ )
2430 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2431 x264_me_t *m = &lX->me8x8[i];
2432 m->i_pixel = PIXEL_8x8;
2433 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2435 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2436 m->i_ref = lX->me16x16.i_ref;
2438 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2440 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2441 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2442 x264_me_search( h, m, &lX->me16x16.mv, 1 );
2443 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2444 m->cost += m->i_ref_cost;
2446 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2448 /* save mv for predicting other partitions within this MB */
2449 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2452 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2453 m->mv[0], m->mv[1], 8, 8, x264_weight_none );
2454 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2456 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2457 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2458 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2459 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2460 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2462 if( h->mb.b_chroma_me )
2464 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2465 i_part_cost_bi += i_chroma_cost;
2466 a->i_satd8x8[2][i] += i_chroma_cost;
2469 i_part_cost = a->l0.me8x8[i].cost;
2470 h->mb.i_sub_partition[i] = D_L0_8x8;
2471 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2472 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2473 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2474 a->i_cost8x8bi += i_part_cost;
2476 /* XXX Needed for x264_mb_predict_mv */
2477 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2481 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
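/* Each 8x8 block's mode is chosen with a running minimum over the L0, L1,
 * BI and DIRECT costs; COPY2_IF_LT overwrites both the best cost and the
 * stored sub-partition code whenever a strictly cheaper candidate appears.
 * Its semantics in isolation (illustrative): */
#if 0
#define COPY2_IF_LT_SKETCH( best_cost, cand_cost, best_mode, cand_mode ) \
    do { if( (cand_cost) < (best_cost) ) { (best_cost) = (cand_cost); (best_mode) = (cand_mode); } } while(0)
#endif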
2484 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2486 ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
2487 ALIGNED_4( int16_t mvc[3][2] );
2489 h->mb.i_partition = D_16x8;
2490 a->i_cost16x8bi = 0;
2492 for( int i = 0; i < 2; i++ )
2495 int i_part_cost_bi = 0;
2496 intptr_t stride[2] = {16,16};
2499 m.i_pixel = PIXEL_16x8;
2500 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2502 for( int l = 0; l < 2; l++ )
2504 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2505 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2506 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2507 lX->me16x8[i].cost = INT_MAX;
2508 for( int j = 0; j < i_ref8s; j++ )
2510 int i_ref = ref8[j];
2511 m.i_ref_cost = REF_COST( l, i_ref );
2513 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2515 CP32( mvc[0], lX->mvc[i_ref][0] );
2516 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2517 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2519 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2520 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2521 x264_me_search( h, &m, mvc, 3 );
2522 m.cost += m.i_ref_cost;
2524 if( m.cost < lX->me16x8[i].cost )
2525 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2530 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2531 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
2532 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2533 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
2534 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2535 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2537 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2538 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2539 + a->l1.me16x8[i].i_ref_cost;
2541 if( h->mb.b_chroma_me )
2542 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
2544 i_part_cost = a->l0.me16x8[i].cost;
2545 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2547 if( a->l1.me16x8[i].cost < i_part_cost )
2549 i_part_cost = a->l1.me16x8[i].cost;
2550 a->i_mb_partition16x8[i] = D_L1_8x8;
2552 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2554 i_part_cost = i_part_cost_bi;
2555 a->i_mb_partition16x8[i] = D_BI_8x8;
2557 a->i_cost16x8bi += i_part_cost;
2559 /* Early termination based on the current SATD score of partition[0]
2560 plus the estimated SATD score of partition[1] */
2561 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2562 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2564 a->i_cost16x8bi = COST_MAX;
2568 x264_mb_cache_mv_b16x8( h, a, i, 0 );
2572 a->i_mb_type16x8 = B_L0_L0
2573 + (a->i_mb_partition16x8[0]>>2) * 3
2574 + (a->i_mb_partition16x8[1]>>2);
2575 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
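/* The mb-type mapping above relies on the *_8x8 sub-partition codes for L0,
 * L1 and BI being spaced four apart, so >>2 turns each half's choice into
 * 0, 1 or 2; the two digits then index a 3x3 block of B macroblock types
 * rooted at B_L0_L0.  E.g. L1 on top and BI on the bottom gives
 * B_L0_L0 + 1*3 + 2, i.e. B_L1_BI.  In isolation (illustrative): */
#if 0
static int b_partition_pair_type( int part_top, int part_bottom )
{
    return B_L0_L0 + (part_top>>2)*3 + (part_bottom>>2);
}
#endif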
2578 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2580 ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
2581 ALIGNED_4( int16_t mvc[3][2] );
2583 h->mb.i_partition = D_8x16;
2584 a->i_cost8x16bi = 0;
2586 for( int i = 0; i < 2; i++ )
2589 int i_part_cost_bi = 0;
2590 intptr_t stride[2] = {8,8};
2593 m.i_pixel = PIXEL_8x16;
2594 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2596 for( int l = 0; l < 2; l++ )
2598 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2599 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2600 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2601 lX->me8x16[i].cost = INT_MAX;
2602 for( int j = 0; j < i_ref8s; j++ )
2604 int i_ref = ref8[j];
2605 m.i_ref_cost = REF_COST( l, i_ref );
2607 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2609 CP32( mvc[0], lX->mvc[i_ref][0] );
2610 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2611 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2613 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2614 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2615 x264_me_search( h, &m, mvc, 3 );
2616 m.cost += m.i_ref_cost;
2618 if( m.cost < lX->me8x16[i].cost )
2619 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2624 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2625 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
2626 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2627 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
2628 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2630 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2631 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2632 + a->l1.me8x16[i].i_ref_cost;
2634 if( h->mb.b_chroma_me )
2635 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
2637 i_part_cost = a->l0.me8x16[i].cost;
2638 a->i_mb_partition8x16[i] = D_L0_8x8;
2640 if( a->l1.me8x16[i].cost < i_part_cost )
2642 i_part_cost = a->l1.me8x16[i].cost;
2643 a->i_mb_partition8x16[i] = D_L1_8x8;
2645 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2647 i_part_cost = i_part_cost_bi;
2648 a->i_mb_partition8x16[i] = D_BI_8x8;
2650 a->i_cost8x16bi += i_part_cost;
2652 /* Early termination based on the current SATD score of partition[0]
2653 plus the estimated SATD score of partition[1] */
2654 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2655 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2657 a->i_cost8x16bi = COST_MAX;
2661 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2665 a->i_mb_type8x16 = B_L0_L0
2666 + (a->i_mb_partition8x16[0]>>2) * 3
2667 + (a->i_mb_partition8x16[1]>>2);
2668 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2671 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2673 int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
2675 h->mb.i_type = P_L0;
2676 if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
2678 h->mb.i_partition = D_16x16;
2679 x264_analyse_update_cache( h, a );
2680 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2683 if( a->l0.i_cost16x8 < thresh )
2685 h->mb.i_partition = D_16x8;
2686 x264_analyse_update_cache( h, a );
2687 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2690 a->l0.i_cost16x8 = COST_MAX;
2692 if( a->l0.i_cost8x16 < thresh )
2694 h->mb.i_partition = D_8x16;
2695 x264_analyse_update_cache( h, a );
2696 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2699 a->l0.i_cost8x16 = COST_MAX;
2701 if( a->l0.i_cost8x8 < thresh )
2703 h->mb.i_type = P_8x8;
2704 h->mb.i_partition = D_8x8;
2705 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2707 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2708 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2709 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2710 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2711 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2712 * for future blocks are those left over from previous RDO calls. */
2713 for( int i = 0; i < 4; i++ )
2715 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2716 int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
2717 int subtype, btype = D_L0_8x8;
2718 uint64_t bcost = COST_MAX64;
2719 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2722 if( costs[subtype] > sub8x8_thresh )
2724 h->mb.i_sub_partition[i] = subtype;
2725 x264_mb_cache_mv_p8x8( h, a, i );
2726 if( subtype == btype )
2728 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2729 COPY2_IF_LT( bcost, cost, btype, subtype );
2731 if( h->mb.i_sub_partition[i] != btype )
2733 h->mb.i_sub_partition[i] = btype;
2734 x264_mb_cache_mv_p8x8( h, a, i );
2739 x264_analyse_update_cache( h, a );
2740 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2743 a->l0.i_cost8x8 = COST_MAX;
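/* Full RD costing is expensive, so with early termination enabled it is only
 * run for partition modes whose SATD cost lands within 25% of the best SATD
 * found; anything outside the threshold keeps COST_MAX and drops out of the
 * later comparisons.  Worked example with made-up numbers: a best SATD of
 * 2000 gives a threshold of 2000*5/4 + 1 = 2501, so a 16x8 SATD of 2400 gets
 * an RD evaluation while one of 2600 does not.  In isolation (illustrative): */
#if 0
static int rd_worth_trying( int satd_cost, int best_satd )
{
    return satd_cost < best_satd * 5/4 + 1;
}
#endif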
2746 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2748 int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
2750 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2752 h->mb.i_type = B_DIRECT;
2753 /* Assumes direct/skip MC is still in fdec */
2754 /* Requires b-rdo to be done before intra analysis */
2755 h->mb.b_skip_mc = 1;
2756 x264_analyse_update_cache( h, a );
2757 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2758 h->mb.b_skip_mc = 0;
2761 //FIXME not all the update_cache calls are needed
2762 h->mb.i_partition = D_16x16;
2764 if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
2766 h->mb.i_type = B_L0_L0;
2767 x264_analyse_update_cache( h, a );
2768 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2772 if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
2774 h->mb.i_type = B_L1_L1;
2775 x264_analyse_update_cache( h, a );
2776 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2780 if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
2782 h->mb.i_type = B_BI_BI;
2783 x264_analyse_update_cache( h, a );
2784 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2788 if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
2790 h->mb.i_type = B_8x8;
2791 h->mb.i_partition = D_8x8;
2792 x264_analyse_update_cache( h, a );
2793 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2794 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2798 if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
2800 h->mb.i_type = a->i_mb_type16x8;
2801 h->mb.i_partition = D_16x8;
2802 x264_analyse_update_cache( h, a );
2803 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2807 if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
2809 h->mb.i_type = a->i_mb_type8x16;
2810 h->mb.i_partition = D_8x16;
2811 x264_analyse_update_cache( h, a );
2812 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2816 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2820 if( IS_INTRA(h->mb.i_type) )
2823 switch( h->mb.i_partition )
2826 if( h->mb.i_type == B_BI_BI )
2828 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2829 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2833 for( int i = 0; i < 2; i++ )
2834 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2836 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2837 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2841 for( int i = 0; i < 2; i++ )
2842 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2844 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2845 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2849 for( int i = 0; i < 4; i++ )
2850 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2852 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2853 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2859 static inline void x264_mb_analyse_transform( x264_t *h )
2861 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2863 /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
2866 int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
2867 int i_cost8 = 0, i_cost4 = 0;
2868 /* Not all platforms have a merged SATD function */
2869 if( h->pixf.sa8d_satd[PIXEL_16x16] )
2872 for( int p = 0; p < plane_count; p++ )
2874 cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2875 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2878 i_cost8 = (uint32_t)cost;
2879 i_cost4 = (uint32_t)(cost >> 32);
2883 for( int p = 0; p < plane_count; p++ )
2885 i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2886 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2887 i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2888 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2892 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2893 h->mb.b_skip_mc = 1;
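/* The fused sa8d_satd path above returns both metrics packed into one 64-bit
 * value: sa8d (the 8x8-transform proxy) in the low 32 bits and satd (the
 * 4x4-transform proxy) in the high 32 bits, which the caller unpacks with a
 * cast and a shift.  The convention in isolation (illustrative): */
#if 0
static uint64_t pack_costs( uint32_t sa8d, uint32_t satd )
{
    return (uint64_t)satd << 32 | sa8d;
}
static void unpack_costs( uint64_t packed, uint32_t *sa8d, uint32_t *satd )
{
    *sa8d = (uint32_t)packed;
    *satd = (uint32_t)(packed >> 32);
}
#endif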
2897 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2899 if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
2901 uint32_t subpart_bak = M32( h->mb.i_sub_partition );
2902 /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
2903 if( h->mb.i_type == P_8x8 )
2904 M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
2905 else if( !x264_transform_allowed[h->mb.i_type] )
2908 x264_analyse_update_cache( h, a );
2909 h->mb.b_transform_8x8 ^= 1;
2910 /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
2911 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2913 if( *i_rd >= i_rd8 )
2916 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2921 h->mb.b_transform_8x8 ^= 1;
2922 M32( h->mb.i_sub_partition ) = subpart_bak;
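/* When flipping the transform size wins, the cached SATD estimate is
 * rescaled by the same ratio so later SATD-based decisions stay roughly in
 * step with the new RD score; the 64-bit cast guards the intermediate
 * product.  Worked example with made-up numbers: RD dropping from 4000 to
 * 3600 rescales a SATD of 1500 to 1500*3600/4000 = 1350.  In isolation
 * (illustrative): */
#if 0
static int rescale_satd( int satd, int rd_new, int rd_old )
{
    return (int)((int64_t)satd * rd_new / rd_old);
}
#endif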
2927 /* Rate-distortion optimal QP selection.
2928 * FIXME: More than half of the benefit of this function seems to be
2929 * in the way it improves the coding of chroma DC (by decimating or
2930 * finding a better way to code a single DC coefficient.)
2931 * There must be a more efficient way to get that portion of the benefit
2932 * without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
2934 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2936 int bcost, cost, failures, prevcost, origcost;
2937 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2938 int last_qp_tried = 0;
2939 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2940 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2942 /* If CBP is already zero, don't raise the quantizer any higher. */
2943 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2945 /* Without psy-RD, require monotonicity when moving quant away from previous
2946 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2947 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2948 * allow 2 failures when moving quant towards previous quant.
2949 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2950 int threshold = (!!h->mb.i_psy_rd);
2951 /* Raise the threshold for failures if we're moving towards the last QP. */
2952 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2953 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2955 h->mb.i_qp = orig_qp;
2957 prevcost = origcost;
2959 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2960 * (up to a point) will too. So, jump down to where the threshold will kick in
2961 * and check the QP there. If the CBP is still empty, skip the main loop.
2962 * If it isn't empty, we would have ended up having to check this QP anyways,
2963 * so as long as we store it for later lookup, we lose nothing. */
2964 int already_checked_qp = -1;
2965 int already_checked_cost = COST_MAX;
2966 if( direction == -1 )
2970 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
2971 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2972 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2973 if( !h->mb.cbp[h->mb.i_mb_xy] )
2975 /* If our empty-CBP block is lower QP than the last QP,
2976 * the last QP almost surely doesn't have a CBP either. */
2977 if( h->mb.i_last_qp > h->mb.i_qp )
2981 already_checked_qp = h->mb.i_qp;
2982 h->mb.i_qp = orig_qp;
2986 h->mb.i_qp += direction;
2987 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
2989 if( h->mb.i_last_qp == h->mb.i_qp )
2991 if( h->mb.i_qp == already_checked_qp )
2992 cost = already_checked_cost;
2995 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2996 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2997 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3000 /* We can't assume that the costs are monotonic over QPs.
3001 * Treating a tie as a failure seems to give better results. */
3002 if( cost < prevcost )
3008 if( failures > threshold )
3010 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
3012 h->mb.i_qp += direction;
3016 /* Always try the last block's QP. */
3017 if( !last_qp_tried )
3019 h->mb.i_qp = h->mb.i_last_qp;
3020 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3021 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3022 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3026 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3028 /* Check transform again; decision from before may no longer be optimal. */
3029 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
3030 x264_mb_transform_8x8_allowed( h ) )
3032 h->mb.b_transform_8x8 ^= 1;
3033 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3035 h->mb.b_transform_8x8 ^= 1;
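/* Overall shape of the QP search above: starting from the analysis QP, walk
 * the quantizer in each direction, tolerating a small number of RD results
 * that fail to improve (more tolerance with psy-RD or when moving towards
 * the previous MB's QP), and always re-check the previous MB's QP since
 * matching it minimises the qp_delta cost.  A simplified sketch of the
 * failure-tolerant walk, with rd_cost() standing in for x264_rd_cost_mb()
 * (illustrative only): */
#if 0
static int qp_walk_sketch( int qp, int direction, int qp_min, int qp_max,
                           int budget, int (*rd_cost)( int qp ) )
{
    int best_qp = qp, best = rd_cost( qp ), prev = best, failures = 0;
    for( qp += direction; qp >= qp_min && qp <= qp_max; qp += direction )
    {
        int cost = rd_cost( qp );
        if( cost < best )
        {
            best = cost;
            best_qp = qp;
        }
        failures = cost < prev ? 0 : failures + 1; /* ties count as failures */
        if( failures > budget )
            break;
        prev = cost;
    }
    return best_qp;
}
#endif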
3039 /*****************************************************************************
3040 * x264_macroblock_analyse:
3041 *****************************************************************************/
3042 void x264_macroblock_analyse( x264_t *h )
3044 x264_mb_analysis_t analysis;
3045 int i_cost = COST_MAX;
3047 h->mb.i_qp = x264_ratecontrol_mb_qp( h );
3048 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
3049 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
3050 if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
3051 h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
3053 if( h->param.analyse.b_mb_info )
3054 h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
3055 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
3057 /*--------------------------- Do the analysis ---------------------------*/
3058 if( h->sh.i_type == SLICE_TYPE_I )
3061 if( analysis.i_mbrd )
3062 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3063 x264_mb_analyse_intra( h, &analysis, COST_MAX );
3064 if( analysis.i_mbrd )
3065 x264_intra_rd( h, &analysis, COST_MAX );
3067 i_cost = analysis.i_satd_i16x16;
3068 h->mb.i_type = I_16x16;
3069 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
3070 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
3071 if( analysis.i_satd_pcm < i_cost )
3072 h->mb.i_type = I_PCM;
3074 else if( analysis.i_mbrd >= 2 )
3075 x264_intra_rd_refine( h, &analysis );
3077 else if( h->sh.i_type == SLICE_TYPE_P )
3081 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
3083 analysis.b_try_skip = 0;
3084 if( analysis.b_force_intra )
3086 if( !h->param.analyse.b_psy )
3088 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3089 goto intra_analysis;
3094 /* Special fast-skip logic using information from mb_info. */
3095 if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
3097 if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
3098 h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
3100 h->mb.i_partition = D_16x16;
3101 /* Use the P-SKIP MV if we can... */
3102 if( !M32(h->mb.cache.pskip_mv) )
3105 h->mb.i_type = P_SKIP;
3107 /* Otherwise, just force a 16x16 block. */
3110 h->mb.i_type = P_L0;
3111 analysis.l0.me16x16.i_ref = 0;
3112 M32( analysis.l0.me16x16.mv ) = 0;
3116 /* Reset the information accordingly */
3117 else if( h->param.analyse.b_mb_info_update )
3118 h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
3121 int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
3122 /* If the current macroblock is off the frame, just skip it. */
3123 if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
3125 /* Fast P_SKIP detection */
3126 else if( h->param.analyse.b_fast_pskip )
3129 // FIXME don't need to check this if the reference frame is done
3131 else if( h->param.analyse.i_subpel_refine >= 3 )
3132 analysis.b_try_skip = 1;
3133 else if( h->mb.i_mb_type_left[0] == P_SKIP ||
3134 h->mb.i_mb_type_top == P_SKIP ||
3135 h->mb.i_mb_type_topleft == P_SKIP ||
3136 h->mb.i_mb_type_topright == P_SKIP )
3137 b_skip = x264_macroblock_probe_pskip( h );
3141 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
3145 h->mb.i_type = P_SKIP;
3146 h->mb.i_partition = D_16x16;
3147 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
3149 /* Set up MVs for future predictors */
3150 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3151 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3155 const unsigned int flags = h->param.analyse.inter;
3158 int i_satd_inter, i_satd_intra;
3160 x264_mb_analyse_load_costs( h, &analysis );
3162 x264_mb_analyse_inter_p16x16( h, &analysis );
3164 if( h->mb.i_type == P_SKIP )
3166 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3167 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3171 if( flags & X264_ANALYSE_PSUB16x16 )
3173 if( h->param.analyse.b_mixed_references )
3174 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
3176 x264_mb_analyse_inter_p8x8( h, &analysis );
3179 /* Select best inter mode */
3181 i_partition = D_16x16;
3182 i_cost = analysis.l0.me16x16.cost;
3184 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3185 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
3188 i_partition = D_8x8;
3189 i_cost = analysis.l0.i_cost8x8;
3192 if( flags & X264_ANALYSE_PSUB8x8 )
3194 for( int i = 0; i < 4; i++ )
3196 x264_mb_analyse_inter_p4x4( h, &analysis, i );
3197 int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
3198 if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
3200 int i_cost8x8 = analysis.l0.i_cost4x4[i];
3201 h->mb.i_sub_partition[i] = D_L0_4x4;
3203 x264_mb_analyse_inter_p8x4( h, &analysis, i );
3204 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
3205 h->mb.i_sub_partition[i], D_L0_8x4 );
3207 x264_mb_analyse_inter_p4x8( h, &analysis, i );
3208 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
3209 h->mb.i_sub_partition[i], D_L0_4x8 );
3211 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
3213 x264_mb_cache_mv_p8x8( h, &analysis, i );
3215 analysis.l0.i_cost8x8 = i_cost;
3219 /* Now do 16x8/8x16 */
3220 int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
3221 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3222 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
3224 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
3225 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3226 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3228 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
3229 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
3231 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
3232 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3233 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3235 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
3236 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
3239 h->mb.i_partition = i_partition;
3242 //FIXME mb_type costs?
3243 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3247 else if( i_partition == D_16x16 )
3249 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3250 i_cost = analysis.l0.me16x16.cost;
3252 else if( i_partition == D_16x8 )
3254 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
3255 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
3256 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
3258 else if( i_partition == D_8x16 )
3260 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
3261 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
3262 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
3264 else if( i_partition == D_8x8 )
3267 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3269 switch( h->mb.i_sub_partition[i8x8] )
3272 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
3273 i_cost += analysis.l0.me8x8[i8x8].cost;
3276 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
3277 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
3278 i_cost += analysis.l0.me8x4[i8x8][0].cost +
3279 analysis.l0.me8x4[i8x8][1].cost;
3282 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
3283 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
3284 i_cost += analysis.l0.me4x8[i8x8][0].cost +
3285 analysis.l0.me4x8[i8x8][1].cost;
3289 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
3290 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
3291 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
3292 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
3293 i_cost += analysis.l0.me4x4[i8x8][0].cost +
3294 analysis.l0.me4x4[i8x8][1].cost +
3295 analysis.l0.me4x4[i8x8][2].cost +
3296 analysis.l0.me4x4[i8x8][3].cost;
3299 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
3305 if( h->mb.b_chroma_me )
3309 x264_mb_analyse_intra( h, &analysis, i_cost );
3310 x264_mb_analyse_intra_chroma( h, &analysis );
3314 x264_mb_analyse_intra_chroma( h, &analysis );
3315 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
3317 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3318 analysis.i_satd_i8x8 += analysis.i_satd_chroma;
3319 analysis.i_satd_i4x4 += analysis.i_satd_chroma;
3322 x264_mb_analyse_intra( h, &analysis, i_cost );
3324 i_satd_inter = i_cost;
3325 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
3326 analysis.i_satd_i8x8,
3327 analysis.i_satd_i4x4 );
3329 if( analysis.i_mbrd )
3331 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
3333 i_partition = D_16x16;
3334 i_cost = analysis.l0.i_rd16x16;
3335 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
3336 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
3337 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
3338 h->mb.i_type = i_type;
3339 h->mb.i_partition = i_partition;
3340 if( i_cost < COST_MAX )
3341 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3342 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
3345 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3346 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3347 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3348 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3350 h->mb.i_type = i_type;
3352 if( analysis.b_force_intra && !IS_INTRA(i_type) )
3354 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
3355 * it were an inter block. */
3356 x264_analyse_update_cache( h, &analysis );
3357 x264_macroblock_encode( h );
3358 for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
3359 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
3362 int height = 16 >> CHROMA_V_SHIFT;
3363 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
3364 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
3366 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3367 goto intra_analysis;
3370 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
3372 if( IS_INTRA( h->mb.i_type ) )
3374 x264_intra_rd_refine( h, &analysis );
3376 else if( i_partition == D_16x16 )
3378 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
3379 analysis.l0.me16x16.cost = i_cost;
3380 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3382 else if( i_partition == D_16x8 )
3384 M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
3385 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
3386 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
3387 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
3388 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
3390 else if( i_partition == D_8x16 )
3392 M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
3393 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
3394 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
3395 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
3396 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
3398 else if( i_partition == D_8x8 )
3400 x264_analyse_update_cache( h, &analysis );
3401 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3403 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
3405 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
3407 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
3409 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3410 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
3412 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
3414 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3415 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
3417 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
3419 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3420 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
3421 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
3422 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
3429 else if( h->sh.i_type == SLICE_TYPE_B )
3431 int i_bskip_cost = COST_MAX;
3434 if( analysis.i_mbrd )
3435 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3437 h->mb.i_type = B_SKIP;
3438 if( h->mb.b_direct_auto_write )
3440 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
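/* Both spatial and temporal direct are probed (the flag is toggled each iteration), and the
 * per-frame skip counts accumulated below feed the choice of direct mode for subsequent B-frames. */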
3441 for( int i = 0; i < 2; i++ )
3444 h->sh.b_direct_spatial_mv_pred ^= 1;
3445 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
3446 if( analysis.b_direct_available )
3451 b_skip = x264_macroblock_probe_bskip( h );
3453 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
3460 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
3462 analysis.b_try_skip = 0;
3463 if( analysis.b_direct_available )
3465 if( !h->mb.b_direct_auto_write )
3467 /* If the current macroblock is entirely below the coded picture (possible when interlaced support rounds the frame height up to a macroblock-pair boundary), just skip it. */
3468 if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
3470 else if( analysis.i_mbrd )
3472 i_bskip_cost = ssd_mb( h );
3473 /* 6 = minimum cavlc cost of a non-skipped MB */
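/* i_lambda2 is lambda^2 scaled by 256 (see x264_lambda2_tab above), so this accepts the skip
 * when its SSD is no more than the RD cost of spending ~6 bits: ssd <= 6*lambda2/256, with +128 for rounding. */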
3474 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
3476 else if( !h->mb.b_direct_auto_write )
3478 /* Conditioning the probe on neighboring block types
3479 * doesn't seem to help speed or quality. */
3480 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
3481 if( h->param.analyse.i_subpel_refine < 3 )
3482 b_skip = analysis.b_try_skip;
3484 /* Set up MVs for future predictors */
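/* mvr[list][ref] holds each MB's 16x16 MV per reference and is read by later MBs as an ME
 * candidate, so a skipped MB records zero vectors here. */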
3487 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3488 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3489 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
3490 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
3496 const unsigned int flags = h->param.analyse.inter;
3500 h->mb.b_skip_mc = 0;
3501 h->mb.i_type = B_DIRECT;
3503 x264_mb_analyse_load_costs( h, &analysis );
3505 /* select best inter mode */
3506 /* direct must be first */
3507 if( analysis.b_direct_available )
3508 x264_mb_analyse_inter_direct( h, &analysis );
3510 x264_mb_analyse_inter_b16x16( h, &analysis );
3512 if( h->mb.i_type == B_SKIP )
3514 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3515 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3516 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
3517 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
3522 i_partition = D_16x16;
3523 i_cost = analysis.l0.me16x16.cost;
3524 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
3525 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
3526 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
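/* Early B_SKIP termination: if direct's SATD is within ~3% (33/32) of the best 16x16 mode,
 * compute the 16x16 RD costs now; should the skip reconstruction's SSD beat them all, commit
 * to B_SKIP and bypass the remaining partition analysis. */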
3528 if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
3530 x264_mb_analyse_b_rd( h, &analysis, i_cost );
3531 if( i_bskip_cost < analysis.i_rd16x16direct &&
3532 i_bskip_cost < analysis.i_rd16x16bi &&
3533 i_bskip_cost < analysis.l0.i_rd16x16 &&
3534 i_bskip_cost < analysis.l1.i_rd16x16 )
3536 h->mb.i_type = B_SKIP;
3537 x264_analyse_update_cache( h, &analysis );
3542 if( flags & X264_ANALYSE_BSUB16x16 )
3544 if( h->param.analyse.b_mixed_references )
3545 x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
3547 x264_mb_analyse_inter_b8x8( h, &analysis );
3549 COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3551 /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
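/* Each 16x8/8x16 half is approximated as the sum of its two 8x8 SATDs plus the average of their
 * MV+ref costs, since the larger partition codes only one MV and reference per list where the
 * 8x8 analysis coded two. */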
3552 int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
3553 int i_mb_type, i_partition16x8[2], i_partition8x16[2];
3554 for( int i = 0; i < 2; i++ )
3556 int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
3557 int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
3559 i_best_cost = COST_MAX;
3560 i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
3561 i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
3562 i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
3563 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
3564 + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3565 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
3566 + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3567 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
3568 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
3569 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
3570 analysis.i_cost_est16x8[i] = i_best_cost;
3573 i_best_cost = COST_MAX;
3574 i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
3575 i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
3576 i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
3577 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
3578 + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3579 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
3580 + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3581 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
3582 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
3583 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
3584 analysis.i_cost_est8x16[i] = i_best_cost;
3586 i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
3587 analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3588 i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
3589 i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
3590 analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3591 i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
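/* The i_mb_type math above exploits the enum layout: D_L0_8x8, D_L1_8x8 and D_BI_8x8 are spaced
 * 4 apart, so >>2 maps them to 0/1/2 and B_L0_L0 + 3*first + second selects the matching B
 * partition type; that type's estimated header-bit cost (times lambda) is added once before the
 * two halves are totalled. */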
3593 /* We can gain a little speed by checking the mode with the lowest estimated cost first */
3594 int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
3595 if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
3597 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3598 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3600 if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
3602 x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
3603 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3605 if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
3607 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3608 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3612 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3617 else if( i_partition == D_16x16 )
3619 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3620 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
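/* The fixed mb_type bit costs are stripped before qpel refinement so the two lists compete on
 * pure ME cost; the winner's type cost is added back below. */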
3621 if( i_type == B_L0_L0 )
3623 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3624 i_cost = analysis.l0.me16x16.cost
3625 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3627 else if( i_type == B_L1_L1 )
3629 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3630 i_cost = analysis.l1.me16x16.cost
3631 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3633 else if( i_type == B_BI_BI )
3635 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3636 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3639 else if( i_partition == D_16x8 )
3641 for( int i = 0; i < 2; i++ )
3643 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3644 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3645 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3646 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3649 else if( i_partition == D_8x16 )
3651 for( int i = 0; i < 2; i++ )
3653 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3654 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3655 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3656 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3659 else if( i_partition == D_8x8 )
3661 for( int i = 0; i < 4; i++ )
3664 int i_part_cost_old;
3666 int i_part_type = h->mb.i_sub_partition[i];
3667 int b_bidir = (i_part_type == D_BI_8x8);
3669 if( i_part_type == D_DIRECT_8x8 )
3671 if( x264_mb_partition_listX_table[0][i_part_type] )
3673 m = &analysis.l0.me8x8[i];
3674 i_part_cost_old = m->cost;
3675 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3676 m->cost -= i_type_cost;
3677 x264_me_refine_qpel( h, m );
3679 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3681 if( x264_mb_partition_listX_table[1][i_part_type] )
3683 m = &analysis.l1.me8x8[i];
3684 i_part_cost_old = m->cost;
3685 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3686 m->cost -= i_type_cost;
3687 x264_me_refine_qpel( h, m );
3689 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3691 /* TODO: update mvp? */
3695 i_satd_inter = i_cost;
3697 if( analysis.i_mbrd )
3699 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3701 i_cost = i_bskip_cost;
3702 i_partition = D_16x16;
3703 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3704 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3705 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3706 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3707 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3708 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3709 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3711 h->mb.i_type = i_type;
3712 h->mb.i_partition = i_partition;
3715 if( h->mb.b_chroma_me )
3719 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3720 x264_mb_analyse_intra_chroma( h, &analysis );
3724 x264_mb_analyse_intra_chroma( h, &analysis );
3725 x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
3727 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3728 analysis.i_satd_i8x8 += analysis.i_satd_chroma;
3729 analysis.i_satd_i4x4 += analysis.i_satd_chroma;
3732 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3734 if( analysis.i_mbrd )
3736 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
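/* 17/16: a tighter intra-RD pruning threshold than the 5/4 used for P macroblocks, since intra
 * rarely pays off in B-frames. */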
3737 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
3740 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3741 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3742 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3743 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3745 h->mb.i_type = i_type;
3746 h->mb.i_partition = i_partition;
3748 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3749 x264_intra_rd_refine( h, &analysis );
3750 if( h->mb.i_subpel_refine >= 5 )
3751 x264_refine_bidir( h, &analysis );
3753 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3756 x264_analyse_update_cache( h, &analysis );
3758 if( i_partition == D_16x16 )
3760 if( i_type == B_L0_L0 )
3762 analysis.l0.me16x16.cost = i_cost;
3763 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3765 else if( i_type == B_L1_L1 )
3767 analysis.l1.me16x16.cost = i_cost;
3768 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3770 else if( i_type == B_BI_BI )
3772 i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3773 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3776 else if( i_partition == D_16x8 )
3778 for( int i = 0; i < 2; i++ )
3780 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3781 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3782 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3783 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3784 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3785 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3787 i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3788 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3792 else if( i_partition == D_8x16 )
3794 for( int i = 0; i < 2; i++ )
3796 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3797 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3798 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3799 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3800 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3801 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3803 i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3804 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3808 else if( i_partition == D_8x8 )
3810 for( int i = 0; i < 4; i++ )
3812 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3813 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3814 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3815 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3816 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3818 i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3819 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3827 x264_analyse_update_cache( h, &analysis );
3829 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3830 * without realizing it. Check for this and account for it if necessary. */
3831 if( analysis.i_mbrd >= 2 )
3833 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3834 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3835 int list = check_mv_lists[h->mb.i_type] - 1;
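/* 4x4 blocks 0 and 12 (scan8[0]/scan8[12]) lie in opposite halves for both 16x8 and 8x16, and MVs
 * are uniform within a partition, so matching MV+ref means the whole MB carries a single motion
 * and can be coded as D_16x16. */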
3836 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3837 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3838 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3839 h->mb.i_partition = D_16x16;
3842 if( !analysis.i_mbrd )
3843 x264_mb_analyse_transform( h );
3845 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3846 x264_mb_analyse_qp_rd( h, &analysis );
3848 h->mb.b_trellis = h->param.analyse.i_trellis;
3849 h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));
3851 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3852 x264_psy_trellis_init( h, 0 );
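/* Trellis and noise reduction change the final coefficients, so the intra prediction/coefficients
 * cached during analysis (tracked by i_skip_intra) can no longer be reused at encode time. */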
3853 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3854 h->mb.i_skip_intra = 0;
3857 /*-------------------- Update MB from the analysis ----------------------*/
3858 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3860 switch( h->mb.i_type )
3863 for( int i = 0; i < 16; i++ )
3864 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3866 x264_mb_analyse_intra_chroma( h, a );
3869 for( int i = 0; i < 4; i++ )
3870 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3872 x264_mb_analyse_intra_chroma( h, a );
3875 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3876 x264_mb_analyse_intra_chroma( h, a );
3883 switch( h->mb.i_partition )
3886 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3887 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3891 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3892 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3893 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3894 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3898 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3899 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3900 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3901 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3905 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3911 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3912 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3913 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3914 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3915 for( int i = 0; i < 4; i++ )
3916 x264_mb_cache_mv_p8x8( h, a, i );
3921 h->mb.i_partition = D_16x16;
3922 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3923 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3929 h->mb.i_partition = h->mb.cache.direct_partition;
3930 x264_mb_load_mv_direct8x8( h, 0 );
3931 x264_mb_load_mv_direct8x8( h, 1 );
3932 x264_mb_load_mv_direct8x8( h, 2 );
3933 x264_mb_load_mv_direct8x8( h, 3 );
3937 /* optimize: cache might not need to be rewritten */
3938 for( int i = 0; i < 4; i++ )
3939 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3942 default: /* the rest of the B types */
3943 switch( h->mb.i_partition )
3946 switch( h->mb.i_type )
3949 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3950 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3952 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3953 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3954 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3957 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3958 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3959 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3961 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3962 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3965 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3966 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3968 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3969 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
3974 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3975 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3978 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3979 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3982 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3988 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3990 for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3993 int ref = h->mb.cache.ref[l][x264_scan8[0]];
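/* With frame threading a reference is only reconstructed up to i_lines_completed; MVs are in
 * quarter-pel units (vertical MVs are per-field under MBAFF, hence the smaller shift), so roughly
 * the bottom-most row this MB references must already be finished by the owning thread. */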
3996 completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
3997 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
3999 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
4000 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
4001 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
4002 h->mb.cache.mv[l][x264_scan8[15]][0],
4003 h->mb.cache.mv[l][x264_scan8[15]][1] );
4004 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
4005 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
4006 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
4007 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
4008 x264_mb_analyse_intra( h, a, COST_MAX );
4009 h->mb.i_type = I_16x16;
4010 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
4011 x264_mb_analyse_intra_chroma( h, a );
4018 #include "slicetype.c"