1 /*****************************************************************************
2 * analyse.c: macroblock analysis
3 *****************************************************************************
4 * Copyright (C) 2003-2013 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
28 #define _ISOC99_SOURCE
30 #include "common/common.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
42 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
74 /* conduct the analysis using this lambda and QP */
79 uint16_t *p_cost_ref[2];
84 /* Take some shortcuts in intra search if intra is deemed unlikely */
86 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
87 int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
92 int i_satd_i16x16_dir[7];
97 ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
101 int i_predict4x4[16];
107 int i_satd_chroma_dir[7];
108 int i_predict8x8chroma;
110 /* II: Inter part P/B frame */
111 x264_mb_analysis_list_t l0;
112 x264_mb_analysis_list_t l1;
114 int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
115 int i_cost16x16direct;
117 int i_cost8x8direct[4];
118 int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
119 int i_cost_est16x8[2]; /* Per-partition estimated cost */
120 int i_cost_est8x16[2];
129 int i_mb_partition16x8[2]; /* mb_partition_e */
130 int i_mb_partition8x16[2];
131 int i_mb_type16x8; /* mb_class_e */
134 int b_direct_available;
135 int b_early_terminate;
137 } x264_mb_analysis_t;
139 /* lambda = pow(2,qp/6-2) */
140 const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
142 1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */
143 1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */
144 2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */
145 4, 4, 5, 6, 6, 7, 8, 9, /* 24-31 */
146 10, 11, 13, 14, 16, 18, 20, 23, /* 32-39 */
147 25, 29, 32, 36, 40, 45, 51, 57, /* 40-47 */
148 64, 72, 81, 91, 102, 114, 128, 144, /* 48-55 */
149 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
150 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
151 1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
152 2580,2896, /* 80-81 */
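/* Illustrative sketch (not compiled in): the table above matches the formula in the
 * comment, rounded and clamped to a minimum of 1 at very low QP, e.g.
 *
 *     for( int qp = 0; qp <= QP_MAX_MAX; qp++ )
 *         lambda_tab[qp] = X264_MAX( (int)( pow( 2.0, qp/6.0 - 2.0 ) + .5 ), 1 );
 *
 * reproduces the entries above to within +-1 of rounding; every 6 QP steps doubles lambda. */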
155 /* lambda2 = pow(lambda,2) * .9 * 256 */
156 /* Capped to avoid overflow */
157 const int x264_lambda2_tab[QP_MAX_MAX+1] =
159 14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */
160 91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */
161 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */
162 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24-31 */
163 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32-39 */
164 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40-47 */
165 943718, 1189010, 1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
166 5992238, 7549747, 9512085, 11984476, 15099494, 19024170, 23968953, 30198988, /* 56-63 */
167 38048341, 47937906, 60397977, 76096683, 95875813, 120795955, /* 64-69 */
168 134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
169 134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
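/* Illustrative sketch (not compiled in): the entries follow the two comments above,
 * with the "Capped to avoid overflow" limit at (1<<27)-1 == 134217727:
 *
 *     double l = pow( 2.0, qp/6.0 - 2.0 );                        // lambda
 *     lambda2_tab[qp] = X264_MIN( (int)( l * l * .9 * 256 ), (1<<27)-1 );
 *
 * (to within +-1 of rounding; the cap takes effect from roughly QP 70 upward). */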
172 const uint8_t x264_exp2_lut[64] =
174 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
175 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
176 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
177 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
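/* Illustrative sketch (not compiled in): each entry is the fractional part of 2^(i/64)
 * expressed in 1/256 units, i.e. approximately
 *
 *     exp2_lut[i] = (uint8_t)( ( pow( 2.0, i/64.0 ) - 1 ) * 256 + .5 );
 *
 * so a caller can turn a fixed-point exponent into an integer shift plus this 6-bit
 * mantissa correction (e.g. the exp2fix8-style helper elsewhere in the codebase
 * combines it with a shift this way). */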
180 const float x264_log2_lut[128] =
182 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
183 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
184 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
185 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
186 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
187 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
188 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
189 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
190 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
191 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
192 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
193 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
194 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
195 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
196 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
197 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
200 /* Avoid an int/float conversion. */
201 const float x264_log2_lz_lut[32] =
203 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
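/* Illustrative sketch (not compiled in): the two tables above combine into a cheap
 * float log2 of a nonzero 32-bit integer -- the leading-zero count gives the integer
 * part, the top 7 mantissa bits index the fractional part
 * (x264_log2_lut[i] ~= log2(1 + i/128)):
 *
 *     int lz = x264_clz( x );
 *     float log2x = x264_log2_lz_lut[lz] + x264_log2_lut[(x << lz >> 24) & 0x7f];
 *
 * which is how the x264_log2() helper in common.h consumes them. */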
206 // should the intra and inter lambdas be different?
207 // I'm just matching the behaviour of deadzone quant.
208 static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
210 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
212 46, 58, 73, 92, 117, 147,
213 185, 233, 294, 370, 466, 587,
214 740, 932, 1174, 1480, 1864, 2349,
215 2959, 3728, 4697, 5918, 7457, 9395,
216 11837, 14914, 18790, 23674, 29828, 37581,
217 47349, 59656, 75163, 94699, 119313, 150326,
218 189399, 238627, 300652, 378798, 477255, 601304,
219 757596, 954511, 1202608, 1515192, 1909022, 2405217,
220 3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
221 12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
222 48486154, 61088726, 76966972, 96972308,
223 122177453,134217727,134217727,134217727,134217727,134217727,
224 134217727,134217727,134217727,134217727,134217727,134217727,
226 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
228 27, 34, 43, 54, 68, 86,
229 108, 136, 172, 216, 273, 343,
230 433, 545, 687, 865, 1090, 1374,
231 1731, 2180, 2747, 3461, 4361, 5494,
232 6922, 8721, 10988, 13844, 17442, 21976,
233 27688, 34885, 43953, 55377, 69771, 87906,
234 110755, 139543, 175813, 221511, 279087, 351627,
235 443023, 558174, 703255, 886046, 1116348, 1406511,
236 1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
237 7088374, 8930791, 11252092, 14176748, 17861583, 22504184,
238 28353495, 35723165, 45008368, 56706990,
239 71446330, 90016736,113413980,134217727,134217727,134217727,
240 134217727,134217727,134217727,134217727,134217727,134217727,
241 134217727,134217727,134217727,134217727,134217727,134217727,
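/* Illustrative sketch (not compiled in): both rows follow the formulas in the comments
 * above, with the same (1<<27)-1 overflow cap as x264_lambda2_tab:
 *
 *     double coef = intra ? .65 * .65 : .85 * .85;
 *     tab[intra][qp] = X264_MIN( (int)( coef * pow( 2.0, qp/3. + 10 - LAMBDA_BITS ) + .5 ),
 *                                (1<<27)-1 );
 *
 * e.g. inter at qp 0 with LAMBDA_BITS == 4 (as in ratecontrol) gives
 * .85*.85 * 2^6 = 46.24 -> 46, the first entry above. */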
245 #define MAX_CHROMA_LAMBDA_OFFSET 36
246 static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
248 16, 20, 25, 32, 40, 50,
249 64, 80, 101, 128, 161, 203,
250 256, 322, 406, 512, 645, 812,
251 1024, 1290, 1625, 2048, 2580, 3250,
252 4096, 5160, 6501, 8192, 10321, 13003,
253 16384, 20642, 26007, 32768, 41285, 52015,
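/* Illustrative sketch (not compiled in): this is a .8 fixed-point multiplier of roughly
 * 2^(delta/3), where delta = qp - effective_chroma_qp + 12 is the index computed in
 * x264_mb_analyse_init_qp() below, so entry 12 is the neutral scale 256:
 *
 *     offset_tab[i] = X264_MIN( (int)( 256 * pow( 2.0, (i - 12)/3.0 ) ), 0xffff );
 */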
257 /* TODO: calculate CABAC costs */
258 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
260 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
262 static const uint8_t i_mb_b16x8_cost_table[17] =
264 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
266 static const uint8_t i_sub_mb_b_cost_table[13] =
268 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
270 static const uint8_t i_sub_mb_p_cost_table[4] =
275 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
277 static uint16_t x264_cost_ref[QP_MAX+1][3][33];
278 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
279 static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
281 float *x264_analyse_prepare_costs( x264_t *h )
283 float *logs = x264_malloc( (2*4*2048+1)*sizeof(float) );
287 for( int i = 1; i <= 2*4*2048; i++ )
288 logs[i] = log2f(i+1)*2 + 1.718f;
292 int x264_analyse_init_costs( x264_t *h, float *logs, int qp )
294 int lambda = x264_lambda_tab[qp];
297 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
298 CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
299 h->cost_mv[qp] += 2*4*2048;
300 for( int i = 0; i <= 2*4*2048; i++ )
303 h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
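/* Illustrative note: logs[] comes from x264_analyse_prepare_costs() above, so each
 * entry here is lambda * (2*log2(i+1) + 1.718), rounded and clamped to 16 bits -- an
 * approximation of the bit cost of a motion vector difference of i quarter-pels.
 * The pointer was advanced by 2*4*2048 so the table is meant to be addressable with
 * negative mvd components as well (the "2 from sign" factor in the allocation comment);
 * a hypothetical lookup:
 *
 *     int cost = h->cost_mv[qp][mv[0] - mvp[0]] + h->cost_mv[qp][mv[1] - mvp[1]];
 */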
305 x264_pthread_mutex_lock( &cost_ref_mutex );
306 for( int i = 0; i < 3; i++ )
307 for( int j = 0; j < 33; j++ )
308 x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
309 x264_pthread_mutex_unlock( &cost_ref_mutex );
310 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
312 for( int j = 0; j < 4; j++ )
314 CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
315 h->cost_mv_fpel[qp][j] += 2*2048;
316 for( int i = -2*2048; i < 2*2048; i++ )
317 h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
320 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
321 for( int i = 0; i < 17; i++ )
322 cost_i4x4_mode[i] = 3*lambda*(i!=8);
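/* Illustrative note: the 17 entries are laid out around a free centre so callers can
 * bias the pointer by (8 - i_pred_mode) and then index by the candidate mode directly:
 * entry [8 + mode - i_pred_mode] is 0 when the candidate equals the predicted mode
 * (only the 1-bit flag, already counted in the base predmode cost) and 3*lambda
 * otherwise (the 3 extra bits of rem_intra4x4_pred_mode). A hypothetical lookup:
 *
 *     int mode_bits_cost = cost_i4x4_mode[8 + i_mode - i_pred_mode];
 *
 * x264_mb_analyse_intra() below passes exactly such a biased pointer to the
 * intra_mbcmp_x9 primitives. */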
328 void x264_analyse_free_costs( x264_t *h )
330 for( int i = 0; i < QP_MAX+1; i++ )
333 x264_free( h->cost_mv[i] - 2*4*2048 );
334 if( h->cost_mv_fpel[i][0] )
335 for( int j = 0; j < 4; j++ )
336 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
340 void x264_analyse_weight_frame( x264_t *h, int end )
342 for( int j = 0; j < h->i_ref[0]; j++ )
344 if( h->sh.weight[j][0].weightfn )
346 x264_frame_t *frame = h->fref[0][j];
347 int width = frame->i_width[0] + 2*PADH;
348 int i_padv = PADV << PARAM_INTERLACED;
350 pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
351 height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
352 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
353 h->fenc->i_lines_weighted += height;
355 for( int k = j; k < h->i_ref[0]; k++ )
356 if( h->sh.weight[k][0].weightfn )
358 pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
359 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
360 src + offset, frame->i_stride[0],
361 width, height, &h->sh.weight[k][0] );
368 /* initialize an array of lambda*nbits for all possible mvs */
369 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
371 a->p_cost_mv = h->cost_mv[a->i_qp];
372 a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
373 a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
376 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
378 int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
379 a->i_lambda = x264_lambda_tab[qp];
380 a->i_lambda2 = x264_lambda2_tab[qp];
382 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
383 if( h->param.analyse.i_trellis )
385 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
386 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
387 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
388 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
390 h->mb.i_psy_rd_lambda = a->i_lambda;
391 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
392 int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
393 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
395 if( qp > QP_MAX_SPEC )
397 h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
398 h->nr_residual_sum = h->nr_residual_sum_buf[1];
399 h->nr_count = h->nr_count_buf[1];
400 h->mb.b_noise_reduction = 1;
401 qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
405 h->nr_offset = h->nr_offset_denoise;
406 h->nr_residual_sum = h->nr_residual_sum_buf[0];
407 h->nr_count = h->nr_count_buf[0];
408 h->mb.b_noise_reduction = 0;
411 a->i_qp = h->mb.i_qp = qp;
412 h->mb.i_chroma_qp = h->chroma_qp_table[qp];
415 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
417 int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
419 /* mbrd == 1 -> RD mode decision */
420 /* mbrd == 2 -> RD refinement */
421 /* mbrd == 3 -> QPRD */
422 a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
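/* e.g. for P-slices: subme 6-7 -> mbrd 1, subme 8-9 -> mbrd 2, subme 10+ -> mbrd 3;
 * B-slices use subme = i_subpel_refine - 1, so they need one extra level of subpel
 * refinement to reach mbrd 1 or 2, while QPRD still requires i_subpel_refine >= 10. */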
423 h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
424 a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;
426 x264_mb_analyse_init_qp( h, a, qp );
428 h->mb.b_transform_8x8 = 0;
434 a->i_satd_chroma = COST_MAX;
436 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
437 * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
438 uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
439 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;
442 a->b_avoid_topright = 0;
444 h->mb.b_lossless ? 0 :
446 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
448 /* II: Inter part P/B frame */
449 if( h->sh.i_type != SLICE_TYPE_I )
451 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
452 // limit motion search to a slightly smaller range than the theoretical limit,
453 // since the search may go a few iterations past its given range
454 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
456 /* Calculate max allowed MV range */
457 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
458 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
459 h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
460 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
461 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
462 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
464 int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
465 int max_mv = max_x - 4*16*h->mb.i_mb_x;
466 /* If we're left of the refresh bar, don't reference right of it. */
467 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
468 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
470 h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
471 h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
472 if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
474 int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
475 int thread_mvy_range = i_fmv_range;
477 if( h->i_thread_frames > 1 )
479 int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
480 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
481 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
482 for( int j = 0; j < h->i_ref[i]; j++ )
484 x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
485 thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
488 if( h->param.b_deterministic )
489 thread_mvy_range = h->param.analyse.i_mv_range_thread;
490 if( PARAM_INTERLACED )
491 thread_mvy_range >>= 1;
493 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
496 if( PARAM_INTERLACED )
498 /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
499 for( int i = 0; i < 3; i++ )
502 mb_y = (h->mb.i_mb_y >> j) + (i == 1);
503 h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
504 h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
505 h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
506 h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
507 h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
508 h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
509 h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
514 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
515 h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
516 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
517 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
518 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
519 h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
520 h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
523 if( PARAM_INTERLACED )
525 int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
526 h->mb.mv_min[1] = h->mb.mv_miny_row[i];
527 h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
528 h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
529 h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
530 h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
531 h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
539 a->l0.i_cost8x16 = COST_MAX;
540 if( h->sh.i_type == SLICE_TYPE_B )
545 a->i_cost8x8direct[0] =
546 a->i_cost8x8direct[1] =
547 a->i_cost8x8direct[2] =
548 a->i_cost8x8direct[3] =
557 a->i_cost16x16direct =
560 a->i_cost8x16bi = COST_MAX;
562 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
563 for( int i = 0; i < 4; i++ )
567 a->l0.i_cost4x8[i] = COST_MAX;
570 /* Fast intra decision */
571 if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
573 /* Always run in fast-intra mode for subme < 3 */
574 if( h->mb.i_subpel_refine > 2 &&
575 ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
576 IS_INTRA( h->mb.i_mb_type_top ) ||
577 IS_INTRA( h->mb.i_mb_type_topleft ) ||
578 IS_INTRA( h->mb.i_mb_type_topright ) ||
579 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
580 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
581 { /* intra is likely */ }
588 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
589 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
591 a->b_force_intra = 1;
593 a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
596 a->b_force_intra = 0;
600 /* Prediction modes allowed for various combinations of neighbors. */
601 /* Terminated by a -1. */
602 /* In order, no neighbors, left, top, top/left, top/left/topleft */
603 static const int8_t i16x16_mode_available[5][5] =
605 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
606 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
607 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
608 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
609 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
612 static const int8_t chroma_mode_available[5][5] =
614 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
615 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
616 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
617 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
618 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
621 static const int8_t i4x4_mode_available[2][5][10] =
624 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
625 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
626 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
627 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
628 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
631 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
632 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
633 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
634 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
635 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
639 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
641 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
642 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
643 return i16x16_mode_available[idx];
646 static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
648 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
649 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
650 return chroma_mode_available[idx];
653 static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
655 int avoid_topright = force_intra && (i&1);
656 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
657 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
658 return i4x4_mode_available[avoid_topright][idx];
661 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
663 int avoid_topright = force_intra && ((i&5) == 5);
664 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
665 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
666 return i4x4_mode_available[avoid_topright][idx];
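/* Illustrative note on the idx computation shared by the four helpers above: assuming
 * MB_LEFT and MB_TOP are the two low bits of i_neighbour, idx lands on 0 (no
 * neighbours), 1 (left only), 2 (top only) or 3 (top and left), and is promoted to 4
 * only when the top-left macroblock is available too -- matching the five rows of the
 * mode-availability tables ("no neighbors, left, top, top/left, top/left/topleft"). */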
669 /* For trellis=2, we need to do this for both sizes of DCT; for trellis=1 we only need to use it on the chosen mode. */
670 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
672 ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
674 if( do_both_dct || h->mb.b_transform_8x8 )
675 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
676 if( do_both_dct || !h->mb.b_transform_8x8 )
677 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
680 /* Reset fenc satd scores cache for psy RD */
681 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
683 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
684 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
685 if( !h->mb.i_psy_rd )
687 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
688 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
690 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
693 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
695 if( a->i_satd_chroma < COST_MAX )
700 if( !h->mb.b_chroma_me )
702 a->i_satd_chroma = 0;
706 /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
707 if( h->mb.b_lossless )
709 x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
710 x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
714 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
715 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
717 a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
718 + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
722 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
723 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
725 /* Prediction selection for chroma */
726 if( predict_mode[3] >= 0 && !h->mb.b_lossless )
728 int satdu[4], satdv[4];
729 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
730 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
731 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
732 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
733 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
734 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
736 for( ; *predict_mode >= 0; predict_mode++ )
738 int i_mode = *predict_mode;
739 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
741 a->i_satd_chroma_dir[i_mode] = i_satd;
742 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
747 for( ; *predict_mode >= 0; predict_mode++ )
750 int i_mode = *predict_mode;
752 /* we do the prediction */
753 if( h->mb.b_lossless )
754 x264_predict_lossless_chroma( h, i_mode );
757 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
758 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
761 /* we calculate the cost */
762 i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
763 h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
764 a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
766 a->i_satd_chroma_dir[i_mode] = i_satd;
767 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
771 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
774 /* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
775 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
777 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
778 pixel *p_src = h->mb.pic.p_fenc[0];
779 pixel *p_dst = h->mb.pic.p_fdec[0];
780 static const int8_t intra_analysis_shortcut[2][2][2][5] =
782 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
783 {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
784 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
785 {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
786 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
787 {-1, -1, -1, -1, -1}},
788 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
789 {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
793 int lambda = a->i_lambda;
795 /*---------------- Try all mode and calculate their score ---------------*/
797 /* 16x16 prediction selection */
798 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
800 /* Not heavily tuned */
801 static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
802 int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
804 if( !h->mb.b_lossless && predict_mode[3] >= 0 )
806 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
807 a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
808 a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
809 a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
810 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
811 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
812 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
814 /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
815 if( a->i_satd_i16x16 <= i16x16_thresh )
817 h->predict_16x16[I_PRED_16x16_P]( p_dst );
818 a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
819 a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
820 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
825 for( ; *predict_mode >= 0; predict_mode++ )
828 int i_mode = *predict_mode;
830 if( h->mb.b_lossless )
831 x264_predict_lossless_16x16( h, 0, i_mode );
833 h->predict_16x16[i_mode]( p_dst );
835 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
836 lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
837 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
838 a->i_satd_i16x16_dir[i_mode] = i_satd;
842 if( h->sh.i_type == SLICE_TYPE_B )
843 /* cavlc mb type prefix */
844 a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
846 if( a->i_satd_i16x16 > i16x16_thresh )
849 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
850 /* 8x8 prediction selection */
851 if( flags & X264_ANALYSE_I8x8 )
853 ALIGNED_ARRAY_32( pixel, edge,[36] );
854 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
855 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
857 // FIXME some bias like in i4x4?
858 int i_cost = lambda * 4; /* base predmode costs */
859 h->mb.i_cbp_luma = 0;
861 if( h->sh.i_type == SLICE_TYPE_B )
862 i_cost += lambda * i_mb_b_cost_table[I_8x8];
864 for( idx = 0;; idx++ )
868 pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
869 pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
870 int i_best = COST_MAX;
871 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
873 predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
874 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
876 if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
878 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
879 i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
880 i_cost += i_best & 0xffff;
882 a->i_predict8x8[idx] = i_best;
883 if( idx == 3 || i_cost > i_satd_thresh )
885 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
889 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
892 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
893 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
894 satd[i_pred_mode] -= 3 * lambda;
895 for( int i = 2; i >= 0; i-- )
898 a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
899 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
902 /* Take analysis shortcuts: don't analyse modes that are too
903 * far away direction-wise from the favored mode. */
904 if( a->i_mbrd < 1 + a->b_fast_intra )
905 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
910 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
913 int i_mode = *predict_mode;
915 if( h->mb.b_lossless )
916 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
918 h->predict_8x8[i_mode]( p_dst_by, edge );
920 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
921 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
922 i_satd -= 3 * lambda;
924 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
925 a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
927 i_cost += i_best + 3*lambda;
929 if( idx == 3 || i_cost > i_satd_thresh )
931 if( h->mb.b_lossless )
932 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
934 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
935 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
937 /* we need to encode this block now (for next ones) */
938 x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
943 a->i_satd_i8x8 = i_cost;
944 if( h->mb.i_skip_intra )
946 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
947 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
948 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
949 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
950 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
951 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
952 if( h->mb.i_skip_intra == 2 )
953 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
958 static const uint16_t cost_div_fix8[3] = {1024,512,341};
959 a->i_satd_i8x8 = COST_MAX;
960 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
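/* Illustrative note: cost_div_fix8[idx] is 4/(idx+1) in .8 fixed point
 * (1024/256 = 4, 512/256 = 2, 341/256 ~= 4/3), so the cost accumulated over the
 * idx+1 blocks analysed before bailing out is scaled up to an estimate for all
 * four 8x8 blocks before the i8x8-vs-i16x16 threshold check below. */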
962 /* Not heavily tuned */
963 static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
964 if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
968 /* 4x4 prediction selection */
969 if( flags & X264_ANALYSE_I4x4 )
971 int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
972 int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
973 h->mb.i_cbp_luma = 0;
975 if( a->b_early_terminate && a->i_mbrd )
976 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
978 if( h->sh.i_type == SLICE_TYPE_B )
979 i_cost += lambda * i_mb_b_cost_table[I_4x4];
981 for( idx = 0;; idx++ )
983 pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
984 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
985 int i_best = COST_MAX;
986 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
988 predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
990 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
991 /* emulate missing topright samples */
992 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
994 if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
996 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
997 i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
998 i_cost += i_best & 0xffff;
1000 a->i_predict4x4[idx] = i_best;
1001 if( i_cost > i_satd_thresh || idx == 15 )
1003 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
1007 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
1010 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
1011 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
1012 satd[i_pred_mode] -= 3 * lambda;
1013 i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
1014 COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
1015 COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
1017 /* Take analysis shortcuts: don't analyse modes that are too
1018 * far away direction-wise from the favored mode. */
1019 if( a->i_mbrd < 1 + a->b_fast_intra )
1020 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
1027 for( ; *predict_mode >= 0; predict_mode++ )
1030 int i_mode = *predict_mode;
1032 if( h->mb.b_lossless )
1033 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
1035 h->predict_4x4[i_mode]( p_dst_by );
1037 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
1038 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
1040 i_satd -= lambda * 3;
1044 a->i_predict4x4[idx] = i_mode;
1049 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
1053 i_cost += i_best + 3 * lambda;
1054 if( i_cost > i_satd_thresh || idx == 15 )
1056 if( h->mb.b_lossless )
1057 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
1059 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
1060 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1062 /* we need to encode this block now (for next ones) */
1063 x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
1067 a->i_satd_i4x4 = i_cost;
1068 if( h->mb.i_skip_intra )
1070 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
1071 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
1072 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1073 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
1074 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
1075 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
1076 if( h->mb.i_skip_intra == 2 )
1077 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
1081 a->i_satd_i4x4 = COST_MAX;
1085 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1087 if( !a->b_early_terminate )
1088 i_satd_thresh = COST_MAX;
1090 if( a->i_satd_i16x16 < i_satd_thresh )
1092 h->mb.i_type = I_16x16;
1093 x264_analyse_update_cache( h, a );
1094 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1097 a->i_satd_i16x16 = COST_MAX;
1099 if( a->i_satd_i4x4 < i_satd_thresh )
1101 h->mb.i_type = I_4x4;
1102 x264_analyse_update_cache( h, a );
1103 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
1106 a->i_satd_i4x4 = COST_MAX;
1108 if( a->i_satd_i8x8 < i_satd_thresh )
1110 h->mb.i_type = I_8x8;
1111 x264_analyse_update_cache( h, a );
1112 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1113 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1116 a->i_satd_i8x8 = COST_MAX;
1119 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1121 uint64_t i_satd, i_best;
1122 int plane_count = CHROMA444 ? 3 : 1;
1123 h->mb.i_skip_intra = 0;
1125 if( h->mb.i_type == I_16x16 )
1127 int old_pred_mode = a->i_predict16x16;
1128 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
1129 int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
1130 i_best = a->i_satd_i16x16;
1131 for( ; *predict_mode >= 0; predict_mode++ )
1133 int i_mode = *predict_mode;
1134 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1136 h->mb.i_intra16x16_pred_mode = i_mode;
1137 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1138 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1142 /* RD selection for chroma prediction */
1145 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
1146 if( predict_mode[1] >= 0 )
1148 int8_t predict_mode_sorted[4];
1150 int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
1152 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
1154 int i_mode = *predict_mode;
1155 if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
1156 predict_mode_sorted[i_max++] = i_mode;
1161 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1162 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1163 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1164 * coefs for the current chroma mode are still around, so we only
1165 * have to recount the bits. */
1166 i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1167 for( int i = 0; i < i_max; i++ )
1169 int i_mode = predict_mode_sorted[i];
1170 if( h->mb.b_lossless )
1171 x264_predict_lossless_chroma( h, i_mode );
1174 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
1175 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
1177 /* if we've already found a mode that needs no residual, then
1178 * probably any mode with a residual will be worse.
1179 * so avoid dct on the remaining modes to improve speed. */
1180 i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1181 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1183 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1184 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1189 if( h->mb.i_type == I_4x4 )
1191 pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
1193 for( int idx = 0; idx < 16; idx++ )
1195 pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
1196 h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
1197 h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
1198 i_best = COST_MAX64;
1200 const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1202 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1203 for( int p = 0; p < plane_count; p++ )
1204 /* emulate missing topright samples */
1205 MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );
1207 for( ; *predict_mode >= 0; predict_mode++ )
1209 int i_mode = *predict_mode;
1210 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1212 if( i_best > i_satd )
1214 a->i_predict4x4[idx] = i_mode;
1216 for( int p = 0; p < plane_count; p++ )
1218 pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
1219 pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
1220 pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
1221 pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
1222 nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
1227 for( int p = 0; p < plane_count; p++ )
1229 MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
1230 MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
1231 MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
1232 MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
1233 h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
1236 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1239 else if( h->mb.i_type == I_8x8 )
1241 ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
1242 pixel4 pels_h[3][2] = {{0}};
1243 pixel pels_v[3][7] = {{0}};
1244 uint16_t nnz[3][2] = {{0}}; //shut up gcc
1245 for( int idx = 0; idx < 4; idx++ )
1249 int s8 = X264_SCAN8_0 + 2*x + 16*y;
1250 pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
1251 h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
1252 h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
1253 int cbp_luma_new = 0;
1254 int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;
1256 i_best = COST_MAX64;
1258 const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
1259 for( int p = 0; p < plane_count; p++ )
1260 h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1262 for( ; *predict_mode >= 0; predict_mode++ )
1264 int i_mode = *predict_mode;
1265 if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
1268 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1269 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );
1271 if( i_best > i_satd )
1273 a->i_predict8x8[idx] = i_mode;
1274 cbp_luma_new = h->mb.i_cbp_luma;
1277 for( int p = 0; p < plane_count; p++ )
1279 pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
1280 pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
1282 for( int j = 0; j < 7; j++ )
1283 pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
1284 nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
1285 nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
1289 a->i_cbp_i8x8_luma = cbp_luma_new;
1290 for( int p = 0; p < plane_count; p++ )
1292 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
1293 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
1295 for( int j = 0; j < 7; j++ )
1296 dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
1297 M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
1298 M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
1301 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1306 #define LOAD_FENC(m, src, xoff, yoff) \
1308 (m)->p_cost_mv = a->p_cost_mv; \
1309 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1310 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1311 (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
1312 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1313 (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1314 (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1317 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1319 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1320 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1321 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1322 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1325 (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
1326 (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
1327 (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
1328 (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
1329 (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
1330 (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
1331 (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
1332 (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
1335 (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
1336 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1337 (m)->weight = x264_weight_none; \
1341 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1342 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1343 (m)->weight = h->sh.weight[i_ref];
1345 #define REF_COST(list, ref) \
1346 (a->p_cost_ref[list][ref])
1348 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1352 ALIGNED_4( int16_t mvc[8][2] );
1353 int i_halfpel_thresh = INT_MAX;
1354 int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
1356 /* 16x16 Search on all ref frame */
1357 m.i_pixel = PIXEL_16x16;
1358 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1360 a->l0.me16x16.cost = INT_MAX;
1361 for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1363 m.i_ref_cost = REF_COST( 0, i_ref );
1364 i_halfpel_thresh -= m.i_ref_cost;
1366 /* search with ref */
1367 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1368 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1370 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1372 if( h->mb.ref_blind_dupe == i_ref )
1374 CP32( m.mv, a->l0.mvc[0][0] );
1375 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1379 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1380 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1383 /* save mv for predicting neighbors */
1384 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1385 CP32( a->l0.mvc[i_ref][0], m.mv );
1387 /* early termination
1388 * SSD threshold would probably be better than SATD */
1391 && m.cost-m.cost_mv < 300*a->i_lambda
1392 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1393 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1394 && x264_macroblock_probe_pskip( h ) )
1396 h->mb.i_type = P_SKIP;
1397 x264_analyse_update_cache( h, a );
1398 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1402 m.cost += m.i_ref_cost;
1403 i_halfpel_thresh += m.i_ref_cost;
1405 if( m.cost < a->l0.me16x16.cost )
1406 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1409 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1410 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1412 h->mb.i_type = P_L0;
1415 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1416 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1418 h->mb.i_partition = D_16x16;
1419 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1420 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1421 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1422 h->mb.i_type = P_SKIP;
1427 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1430 pixel **p_fenc = h->mb.pic.p_fenc;
1431 int i_maxref = h->mb.pic.i_fref[0]-1;
1433 h->mb.i_partition = D_8x8;
1435 #define CHECK_NEIGHBOUR(i)\
1437 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1438 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1442 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1443 * than those used by the neighbors */
1444 if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1445 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
1448 CHECK_NEIGHBOUR( -8 - 1 );
1449 CHECK_NEIGHBOUR( -8 + 0 );
1450 CHECK_NEIGHBOUR( -8 + 2 );
1451 CHECK_NEIGHBOUR( -8 + 4 );
1452 CHECK_NEIGHBOUR( 0 - 1 );
1453 CHECK_NEIGHBOUR( 2*8 - 1 );
1455 #undef CHECK_NEIGHBOUR
1457 for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1458 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1460 for( int i = 0; i < 4; i++ )
1462 x264_me_t *l0m = &a->l0.me8x8[i];
1466 m.i_pixel = PIXEL_8x8;
1468 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1469 l0m->cost = INT_MAX;
1470 for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1472 m.i_ref_cost = REF_COST( 0, i_ref );
1474 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1475 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1477 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1478 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1479 if( h->mb.ref_blind_dupe == i_ref )
1481 CP32( m.mv, a->l0.mvc[0][i+1] );
1482 x264_me_refine_qpel_refdupe( h, &m, NULL );
1485 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1487 m.cost += m.i_ref_cost;
1489 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1491 if( m.cost < l0m->cost )
1492 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1493 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1494 i_ref = h->mb.ref_blind_dupe;
1498 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1499 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1501 a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1503 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1504 are effectively zero. */
1505 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1506 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1509 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1510 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1511 /* P_8x8 ref0 has no ref cost */
1512 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1513 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1514 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1515 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1516 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1519 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1521 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1522 * reference frame flags. Thus, if we're not doing mixedrefs, just
1523 * don't bother analysing the dupes. */
1524 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1525 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1526 pixel **p_fenc = h->mb.pic.p_fenc;
1528 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1530 /* XXX Needed for x264_mb_predict_mv */
1531 h->mb.i_partition = D_8x8;
1534 CP32( mvc[0], a->l0.me16x16.mv );
1536 for( int i = 0; i < 4; i++ )
1538 x264_me_t *m = &a->l0.me8x8[i];
1542 m->i_pixel = PIXEL_8x8;
1543 m->i_ref_cost = i_ref_cost;
1545 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1546 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1547 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1549 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1550 x264_me_search( h, m, mvc, i_mvc );
1552 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1554 CP32( mvc[i_mvc], m->mv );
1557 a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1560 m->cost += i_ref_cost;
1561 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1562 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1565 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1566 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1567 /* theoretically this should include 4*ref_cost,
1568 * but 3 seems a better approximation of cabac. */
1569 if( h->param.b_cabac )
1570 a->l0.i_cost8x8 -= i_ref_cost;
1571 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1572 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1575 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1578 pixel **p_fenc = h->mb.pic.p_fenc;
1579 ALIGNED_4( int16_t mvc[3][2] );
1581 /* XXX Needed for x264_mb_predict_mv */
1582 h->mb.i_partition = D_16x8;
1584 for( int i = 0; i < 2; i++ )
1586 x264_me_t *l0m = &a->l0.me16x8[i];
1587 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1588 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1589 const int ref8[2] = { minref, maxref };
1590 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1592 m.i_pixel = PIXEL_16x8;
1594 LOAD_FENC( &m, p_fenc, 0, 8*i );
1595 l0m->cost = INT_MAX;
1596 for( int j = 0; j < i_ref8s; j++ )
1598 const int i_ref = ref8[j];
1599 m.i_ref_cost = REF_COST( 0, i_ref );
1601 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1602 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1603 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1604 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1606 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1607 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1609 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1610 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1611 /* We can only take this shortcut if the first search was performed on ref0. */
1612 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1614 /* We can just leave the MV from the previous ref search. */
1615 x264_me_refine_qpel_refdupe( h, &m, NULL );
1618 x264_me_search( h, &m, mvc, 3 );
1620 m.cost += m.i_ref_cost;
1622 if( m.cost < l0m->cost )
1623 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1626 /* Early termination based on the current SATD score of partition[0]
1627 plus the estimated SATD score of partition[1] */
1628 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1630 a->l0.i_cost16x8 = COST_MAX;
1634 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1635 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1638 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1641 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1644 pixel **p_fenc = h->mb.pic.p_fenc;
1645 ALIGNED_4( int16_t mvc[3][2] );
1647 /* XXX Needed for x264_mb_predict_mv */
1648 h->mb.i_partition = D_8x16;
1650 for( int i = 0; i < 2; i++ )
1652 x264_me_t *l0m = &a->l0.me8x16[i];
1653 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1654 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1655 const int ref8[2] = { minref, maxref };
1656 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1658 m.i_pixel = PIXEL_8x16;
1660 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1661 l0m->cost = INT_MAX;
1662 for( int j = 0; j < i_ref8s; j++ )
1664 const int i_ref = ref8[j];
1665 m.i_ref_cost = REF_COST( 0, i_ref );
1667 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1668 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1669 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1671 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1672 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1674 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1675 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1676 /* We can only take this shortcut if the first search was performed on ref0. */
1677 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1679 /* We can just leave the MV from the previous ref search. */
1680 x264_me_refine_qpel_refdupe( h, &m, NULL );
1683 x264_me_search( h, &m, mvc, 3 );
1685 m.cost += m.i_ref_cost;
1687 if( m.cost < l0m->cost )
1688 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1691 /* Early termination based on the current SATD score of partition[0]
1692 plus the estimated SATD score of partition[1] */
1693 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1695 a->l0.i_cost8x16 = COST_MAX;
1699 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1700 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1703 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1706 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
1707 pixel **p_fref, int i8x8, int size, int chroma )
1709 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
1710 pixel *pix2 = pix1+8;
1711 int i_stride = h->mb.pic.i_stride[1];
1712 int chroma_h_shift = chroma <= CHROMA_422;
1713 int chroma_v_shift = chroma == CHROMA_420;
1714 int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
1715 int i_ref = a->l0.me8x8[i8x8].i_ref;
1716 int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1717 x264_weight_t *weight = h->sh.weight[i_ref];
1719 // FIXME weight can be done on 4x4 blocks even if mc is smaller
1720 #define CHROMA4x4MC( width, height, me, x, y ) \
1721 if( chroma == CHROMA_444 ) \
1723 int mvx = (me).mv[0] + 4*2*x; \
1724 int mvy = (me).mv[1] + 4*2*y; \
1725 h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
1726 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
1727 h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
1728 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
1732 int offset = x + (2>>chroma_v_shift)*16*y; \
1733 int chroma_height = (2>>chroma_v_shift)*height; \
1734 h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
1735 (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
1736 if( weight[1].weightfn ) \
1737 weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
1738 if( weight[2].weightfn ) \
1739 weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
1742 if( size == PIXEL_4x4 )
1744 x264_me_t *m = a->l0.me4x4[i8x8];
1745 CHROMA4x4MC( 2,2, m[0], 0,0 );
1746 CHROMA4x4MC( 2,2, m[1], 2,0 );
1747 CHROMA4x4MC( 2,2, m[2], 0,2 );
1748 CHROMA4x4MC( 2,2, m[3], 2,2 );
1750 else if( size == PIXEL_8x4 )
1752 x264_me_t *m = a->l0.me8x4[i8x8];
1753 CHROMA4x4MC( 4,2, m[0], 0,0 );
1754 CHROMA4x4MC( 4,2, m[1], 0,2 );
1758 x264_me_t *m = a->l0.me4x8[i8x8];
1759 CHROMA4x4MC( 2,4, m[0], 0,0 );
1760 CHROMA4x4MC( 2,4, m[1], 2,0 );
1764 int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
1765 int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
1766 return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1767 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1770 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1772 if( CHROMA_FORMAT == CHROMA_444 )
1773 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
1774 else if( CHROMA_FORMAT == CHROMA_422 )
1775 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
1777 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
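/* Note: the _internal worker above is ALWAYS_INLINE with a compile-time
 * constant chroma argument, so this wrapper instantiates one specialized copy
 * per chroma format and the shift/branch logic folds away.  In 4:4:4 the two
 * "chroma" planes are full resolution, so CHROMA4x4MC uses plain luma MC on
 * planes 1 and 2; for 4:2:0/4:2:2 it calls mc_chroma on the interleaved
 * reference plane and scales the vertical MV by the subsampling shift. */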
1780 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1782 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1783 pixel **p_fenc = h->mb.pic.p_fenc;
1784 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1786 /* XXX Needed for x264_mb_predict_mv */
1787 h->mb.i_partition = D_8x8;
1789 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1791 const int idx = 4*i8x8 + i4x4;
1792 const int x4 = block_idx_x[idx];
1793 const int y4 = block_idx_y[idx];
1794 const int i_mvc = (i4x4 == 0);
1796 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1798 m->i_pixel = PIXEL_4x4;
1800 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1801 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1802 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1804 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1805 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1807 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1809 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1810 a->l0.me4x4[i8x8][1].cost +
1811 a->l0.me4x4[i8x8][2].cost +
1812 a->l0.me4x4[i8x8][3].cost +
1813 REF_COST( 0, i_ref ) +
1814 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1815 if( h->mb.b_chroma_me )
1816 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
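/* H.264 signals one reference index per 8x8 block, so the sub-8x8 searches
 * (4x4 above, 8x4/4x8 below) all reuse the reference chosen by the
 * corresponding 8x8 search.  Each sub-partition cost is the sum of its
 * per-block ME costs plus the shared REF_COST and a lambda-scaled
 * sub_mb_type bit cost, with chroma SATD added when chroma ME is enabled. */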
1819 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1821 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1822 pixel **p_fenc = h->mb.pic.p_fenc;
1823 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1825 /* XXX Needed for x264_mb_predict_mv */
1826 h->mb.i_partition = D_8x8;
1828 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1830 const int idx = 4*i8x8 + 2*i8x4;
1831 const int x4 = block_idx_x[idx];
1832 const int y4 = block_idx_y[idx];
1833 const int i_mvc = (i8x4 == 0);
1835 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1837 m->i_pixel = PIXEL_8x4;
1839 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1840 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1841 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1843 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1844 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1846 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1848 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1849 REF_COST( 0, i_ref ) +
1850 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1851 if( h->mb.b_chroma_me )
1852 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1855 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1857 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1858 pixel **p_fenc = h->mb.pic.p_fenc;
1859 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1861 /* XXX Needed for x264_mb_predict_mv */
1862 h->mb.i_partition = D_8x8;
1864 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1866 const int idx = 4*i8x8 + i4x8;
1867 const int x4 = block_idx_x[idx];
1868 const int y4 = block_idx_y[idx];
1869 const int i_mvc = (i4x8 == 0);
1871 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1873 m->i_pixel = PIXEL_4x8;
1875 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1876 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1877 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1879 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1880 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1882 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1884 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1885 REF_COST( 0, i_ref ) +
1886 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1887 if( h->mb.b_chroma_me )
1888 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
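/* The 8x4 and 4x8 searches seed x264_me_search with the MV of the first 4x4
 * block (me4x4[i8x8][0].mv), so x264_mb_analyse_inter_p4x4 must have run for
 * this 8x8 block first, which is the order used by x264_macroblock_analyse in
 * its PSUB8x8 loop. */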
1891 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
1893 ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
1894 ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
1895 int i_chroma_cost = 0;
1896 int chromapix = h->luma2chroma_pixel[i_pixel];
1898 #define COST_BI_CHROMA( m0, m1, width, height ) \
1902 h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
1903 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1904 h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
1905 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1906 h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
1907 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1908 h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
1909 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1913 int v_shift = CHROMA_V_SHIFT; \
1914 int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1915 int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1916 h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
1917 m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1918 h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
1919 m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1921 h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1922 h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1923 i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
1924 + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
1927 if( i_pixel == PIXEL_16x16 )
1928 COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
1929 else if( i_pixel == PIXEL_16x8 )
1930 COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
1931 else if( i_pixel == PIXEL_8x16 )
1932 COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
1934 COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
1936 return i_chroma_cost;
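/* Note on the mvy offsets above: under MBAFF field coding the parity test via
 * "& i_ref" relies on opposite-parity fields sitting at odd reference indices
 * in x264's reference ordering, and the +/-2 quarter-pel vertical bias
 * compensates for the chroma siting difference between the two fields. */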
1939 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1941 /* Assumes that fdec still contains the results of
1942 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1944 pixel *p_fenc = h->mb.pic.p_fenc[0];
1945 pixel *p_fdec = h->mb.pic.p_fdec[0];
1947 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1948 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1950 int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
1952 for( int i = 0; i < 4; i++ )
1954 const int x = (i&1)*8;
1955 const int y = (i>>1)*8;
1956 a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
1957 &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1958 if( h->mb.b_chroma_me )
1960 int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
1961 int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
1962 a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
1963 &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
1964 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
1965 &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
1967 a->i_cost16x16direct += a->i_cost8x8direct[i];
1970 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1975 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
1976 if( h->mb.b_chroma_me )
1978 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
1979 a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
1980 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
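/* Because the direct-mode prediction is still in fdec (as noted at the top of
 * x264_mb_analyse_inter_direct), the direct cost is measured by comparing
 * fenc against fdec with no additional MC.  The per-8x8 scores are kept in
 * i_cost8x8direct[] so that D_DIRECT_8x8 can later compete with L0/L1/BI in
 * the B_8x8 sub-partition decision. */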
1985 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1987 ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
1988 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
1990 intptr_t stride0 = 16, stride1 = 16;
1992 ALIGNED_4( int16_t mvc[9][2] );
1993 int try_skip = a->b_try_skip;
1994 int list1_skipped = 0;
1995 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
1996 int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
1997 (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};
2000 m.i_pixel = PIXEL_16x16;
2002 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
2004 /* 16x16 Search on list 0 and list 1 */
2005 a->l0.me16x16.cost = INT_MAX;
2006 a->l1.me16x16.cost = INT_MAX;
2007 for( int l = 1; l >= 0; )
2009 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2011 /* This loop is extremely munged in order to facilitate the following order of operations,
2012 * necessary for an efficient fast skip.
2013 * 1. Search list1 ref0.
2014 * 2. Search list0 ref0.
2015 * 3. Try skip.
2016 * 4. Search the rest of list0.
2017 * 5. Go back and finish list1.
2019 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
2021 if( try_skip && l == 1 && i_ref > 0 )
2027 m.i_ref_cost = REF_COST( l, i_ref );
2029 /* search with ref */
2030 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
2031 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
2032 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
2033 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
2036 m.cost += m.i_ref_cost;
2038 if( m.cost < lX->me16x16.cost )
2039 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
2041 /* save mv for predicting neighbors */
2042 CP32( lX->mvc[i_ref][0], m.mv );
2043 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
2045 /* Fast skip detection. */
2046 if( i_ref == 0 && try_skip )
2048 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
2049 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
2051 try_skip = 0;
2053 else if( !l )
2055 /* We already tested skip */
2056 h->mb.i_type = B_SKIP;
2057 x264_analyse_update_cache( h, a );
2058 return;
2062 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
2064 if( list1_skipped && l == 0 )
2070 /* get cost of BI mode */
2071 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
2072 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
2073 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
2074 src0 = h->mc.get_ref( pix0, &stride0,
2075 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
2076 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
2077 src1 = h->mc.get_ref( pix1, &stride1,
2078 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
2079 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );
2081 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2083 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2085 + a->l0.bi16x16.cost_mv
2086 + a->l1.bi16x16.cost_mv;
2088 if( h->mb.b_chroma_me )
2089 a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
2091 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
2092 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
2094 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
2095 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
2096 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
2097 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
2098 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2099 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2100 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2101 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2102 + ref_costs + l0_mv_cost + l1_mv_cost;
2104 if( h->mb.b_chroma_me )
2106 ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
2110 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2111 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2112 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2113 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
2114 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2115 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2116 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2117 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
2121 ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
2122 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2123 int v_shift = CHROMA_V_SHIFT;
2125 if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
2127 int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2128 h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2129 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
2132 h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2133 h->mb.pic.i_stride[1], 16>>v_shift );
2135 if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
2137 int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2138 h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2139 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
2142 h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2143 h->mb.pic.i_stride[1], 16>>v_shift );
2145 h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
2146 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2147 h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
2148 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2150 cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
2151 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
2155 if( cost00 < a->i_cost16x16bi )
2157 M32( a->l0.bi16x16.mv ) = 0;
2158 M32( a->l1.bi16x16.mv ) = 0;
2159 a->l0.bi16x16.cost_mv = l0_mv_cost;
2160 a->l1.bi16x16.cost_mv = l1_mv_cost;
2161 a->i_cost16x16bi = cost00;
2166 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
2167 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
2168 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
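/* Note on the 0,0 candidate above: p_cost_mv is indexed by the MV difference,
 * so the cost of coding a zero vector is looked up as p_cost_mv[-mvp[0]] +
 * p_cost_mv[-mvp[1]], i.e. the bits of an MVD equal to minus the predictor. */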
2171 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
2176 switch( h->mb.i_sub_partition[i] )
2179 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
2182 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
2183 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
2186 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
2187 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
2190 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
2191 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
2192 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
2193 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
2196 x264_log( h, X264_LOG_ERROR, "internal error\n" );
2201 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
2205 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
2206 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
2207 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
2208 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
2211 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
2212 if( x264_mb_partition_listX_table[0][part] ) \
2214 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
2215 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
2219 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
2220 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
2222 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
2224 if( x264_mb_partition_listX_table[1][part] ) \
2226 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
2227 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
2231 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
2232 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
2234 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
2237 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2241 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
2243 x264_mb_load_mv_direct8x8( h, i );
2246 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
2247 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
2248 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
2253 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
2256 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2258 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
2260 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2262 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
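/* CACHE_MV_BI stores ref -1 and zero MV/MVD into whichever list the chosen
 * partition type does not use, so the MV cache and later entropy coding see a
 * consistent single-direction prediction for L0-only or L1-only partitions. */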
2266 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
2268 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2269 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
2271 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
2272 * than those used by the neighbors */
2273 #define CHECK_NEIGHBOUR(i)\
2275 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
2276 if( ref > i_maxref[l] )\
2280 for( int l = 0; l < 2; l++ )
2282 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2283 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
2284 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
2287 CHECK_NEIGHBOUR( -8 - 1 );
2288 CHECK_NEIGHBOUR( -8 + 0 );
2289 CHECK_NEIGHBOUR( -8 + 2 );
2290 CHECK_NEIGHBOUR( -8 + 4 );
2291 CHECK_NEIGHBOUR( 0 - 1 );
2292 CHECK_NEIGHBOUR( 2*8 - 1 );
2296 /* XXX Needed for x264_mb_predict_mv */
2297 h->mb.i_partition = D_8x8;
2301 for( int i = 0; i < 4; i++ )
2307 intptr_t stride[2] = {8,8};
2310 m.i_pixel = PIXEL_8x8;
2311 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2313 for( int l = 0; l < 2; l++ )
2315 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2317 lX->me8x8[i].cost = INT_MAX;
2318 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
2320 m.i_ref_cost = REF_COST( l, i_ref );
2322 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
2324 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
2325 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2326 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
2327 m.cost += m.i_ref_cost;
2329 if( m.cost < lX->me8x8[i].cost )
2331 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
2332 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
2335 /* save mv for predicting other partitions within this MB */
2336 CP32( lX->mvc[i_ref][i+1], m.mv );
2341 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
2342 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
2343 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
2344 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
2345 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
2346 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
2348 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2349 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
2350 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
2351 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2353 if( h->mb.b_chroma_me )
2355 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2356 i_part_cost_bi += i_chroma_cost;
2357 a->i_satd8x8[2][i] += i_chroma_cost;
2360 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2361 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2363 i_part_cost = a->l0.me8x8[i].cost;
2364 h->mb.i_sub_partition[i] = D_L0_8x8;
2365 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2366 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2367 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2368 a->i_cost8x8bi += i_part_cost;
2370 /* XXX Needed for x264_mb_predict_mv */
2371 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2375 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2378 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2381 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2382 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2383 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2385 /* XXX Needed for x264_mb_predict_mv */
2386 h->mb.i_partition = D_8x8;
2390 for( int i = 0; i < 4; i++ )
2395 int i_part_cost_bi = 0;
2396 intptr_t stride[2] = {8,8};
2399 for( int l = 0; l < 2; l++ )
2401 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2402 x264_me_t *m = &lX->me8x8[i];
2403 m->i_pixel = PIXEL_8x8;
2404 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2406 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2407 m->i_ref = lX->me16x16.i_ref;
2409 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2411 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2412 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2413 x264_me_search( h, m, &lX->me16x16.mv, 1 );
2414 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2415 m->cost += m->i_ref_cost;
2417 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2419 /* save mv for predicting other partitions within this MB */
2420 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2423 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2424 m->mv[0], m->mv[1], 8, 8, x264_weight_none );
2425 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2427 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2428 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2429 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2430 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2431 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2433 if( h->mb.b_chroma_me )
2435 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2436 i_part_cost_bi += i_chroma_cost;
2437 a->i_satd8x8[2][i] += i_chroma_cost;
2440 i_part_cost = a->l0.me8x8[i].cost;
2441 h->mb.i_sub_partition[i] = D_L0_8x8;
2442 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2443 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2444 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2445 a->i_cost8x8bi += i_part_cost;
2447 /* XXX Needed for x264_mb_predict_mv */
2448 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2452 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
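/* The two b8x8 paths above differ only in reference selection: the mixed-ref
 * variant searches every reference up to a cap derived from the neighbours,
 * while the plain variant reuses each list's 16x16 reference.  In both,
 * i_satd8x8[l][i] stores the SATD-only part of the cost (mv/ref bits
 * subtracted) so the 16x8/8x16 early-termination estimates can be built
 * from it. */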
2455 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2457 ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
2458 ALIGNED_4( int16_t mvc[3][2] );
2460 h->mb.i_partition = D_16x8;
2461 a->i_cost16x8bi = 0;
2463 for( int i = 0; i < 2; i++ )
2466 int i_part_cost_bi = 0;
2467 intptr_t stride[2] = {16,16};
2470 m.i_pixel = PIXEL_16x8;
2471 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2473 for( int l = 0; l < 2; l++ )
2475 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2476 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2477 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2478 lX->me16x8[i].cost = INT_MAX;
2479 for( int j = 0; j < i_ref8s; j++ )
2481 int i_ref = ref8[j];
2482 m.i_ref_cost = REF_COST( l, i_ref );
2484 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2486 CP32( mvc[0], lX->mvc[i_ref][0] );
2487 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2488 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2490 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2491 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2492 x264_me_search( h, &m, mvc, 3 );
2493 m.cost += m.i_ref_cost;
2495 if( m.cost < lX->me16x8[i].cost )
2496 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2501 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2502 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
2503 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2504 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
2505 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2506 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2508 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2509 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2510 + a->l1.me16x8[i].i_ref_cost;
2512 if( h->mb.b_chroma_me )
2513 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
2515 i_part_cost = a->l0.me16x8[i].cost;
2516 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2518 if( a->l1.me16x8[i].cost < i_part_cost )
2520 i_part_cost = a->l1.me16x8[i].cost;
2521 a->i_mb_partition16x8[i] = D_L1_8x8;
2523 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2525 i_part_cost = i_part_cost_bi;
2526 a->i_mb_partition16x8[i] = D_BI_8x8;
2528 a->i_cost16x8bi += i_part_cost;
2530 /* Early termination based on the current SATD score of partition[0]
2531 plus the estimated SATD score of partition[1] */
2532 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2533 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2535 a->i_cost16x8bi = COST_MAX;
2539 x264_mb_cache_mv_b16x8( h, a, i, 0 );
2543 a->i_mb_type16x8 = B_L0_L0
2544 + (a->i_mb_partition16x8[0]>>2) * 3
2545 + (a->i_mb_partition16x8[1]>>2);
2546 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
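/* The mb_partition values are laid out so that D_L0_8x8, D_L1_8x8 and
 * D_BI_8x8 shift down (>>2) to 0, 1 and 2; B_L0_L0 starts a 3x3 grid of
 * 16x8/8x16 B macroblock types, so B_L0_L0 + 3*top + bottom selects e.g.
 * B_L0_L1 when the top half chose L0 and the bottom half chose L1.  The 8x16
 * analysis below uses the same mapping and the same i_mb_b16x8_cost_table. */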
2549 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2551 ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
2552 ALIGNED_4( int16_t mvc[3][2] );
2554 h->mb.i_partition = D_8x16;
2555 a->i_cost8x16bi = 0;
2557 for( int i = 0; i < 2; i++ )
2560 int i_part_cost_bi = 0;
2561 intptr_t stride[2] = {8,8};
2564 m.i_pixel = PIXEL_8x16;
2565 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2567 for( int l = 0; l < 2; l++ )
2569 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2570 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2571 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2572 lX->me8x16[i].cost = INT_MAX;
2573 for( int j = 0; j < i_ref8s; j++ )
2575 int i_ref = ref8[j];
2576 m.i_ref_cost = REF_COST( l, i_ref );
2578 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2580 CP32( mvc[0], lX->mvc[i_ref][0] );
2581 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2582 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2584 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2585 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2586 x264_me_search( h, &m, mvc, 3 );
2587 m.cost += m.i_ref_cost;
2589 if( m.cost < lX->me8x16[i].cost )
2590 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2595 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2596 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
2597 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2598 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
2599 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2601 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2602 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2603 + a->l1.me8x16[i].i_ref_cost;
2605 if( h->mb.b_chroma_me )
2606 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
2608 i_part_cost = a->l0.me8x16[i].cost;
2609 a->i_mb_partition8x16[i] = D_L0_8x8;
2611 if( a->l1.me8x16[i].cost < i_part_cost )
2613 i_part_cost = a->l1.me8x16[i].cost;
2614 a->i_mb_partition8x16[i] = D_L1_8x8;
2616 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2618 i_part_cost = i_part_cost_bi;
2619 a->i_mb_partition8x16[i] = D_BI_8x8;
2621 a->i_cost8x16bi += i_part_cost;
2623 /* Early termination based on the current SATD score of partition[0]
2624 plus the estimated SATD score of partition[1] */
2625 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2626 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2628 a->i_cost8x16bi = COST_MAX;
2632 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2636 a->i_mb_type8x16 = B_L0_L0
2637 + (a->i_mb_partition8x16[0]>>2) * 3
2638 + (a->i_mb_partition8x16[1]>>2);
2639 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2642 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2644 int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
2646 h->mb.i_type = P_L0;
2647 if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
2649 h->mb.i_partition = D_16x16;
2650 x264_analyse_update_cache( h, a );
2651 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2654 if( a->l0.i_cost16x8 < thresh )
2656 h->mb.i_partition = D_16x8;
2657 x264_analyse_update_cache( h, a );
2658 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2661 a->l0.i_cost16x8 = COST_MAX;
2663 if( a->l0.i_cost8x16 < thresh )
2665 h->mb.i_partition = D_8x16;
2666 x264_analyse_update_cache( h, a );
2667 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2670 a->l0.i_cost8x16 = COST_MAX;
2672 if( a->l0.i_cost8x8 < thresh )
2674 h->mb.i_type = P_8x8;
2675 h->mb.i_partition = D_8x8;
2676 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2678 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2679 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2680 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2681 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2682 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2683 * for future blocks are those left over from previous RDO calls. */
2684 for( int i = 0; i < 4; i++ )
2686 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2687 int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
2688 int subtype, btype = D_L0_8x8;
2689 uint64_t bcost = COST_MAX64;
2690 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2693 if( costs[subtype] > sub8x8_thresh )
2695 h->mb.i_sub_partition[i] = subtype;
2696 x264_mb_cache_mv_p8x8( h, a, i );
2697 if( subtype == btype )
2699 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2700 COPY2_IF_LT( bcost, cost, btype, subtype );
2702 if( h->mb.i_sub_partition[i] != btype )
2704 h->mb.i_sub_partition[i] = btype;
2705 x264_mb_cache_mv_p8x8( h, a, i );
2710 x264_analyse_update_cache( h, a );
2711 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2714 a->l0.i_cost8x8 = COST_MAX;
2717 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2719 int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
2721 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2723 h->mb.i_type = B_DIRECT;
2724 /* Assumes direct/skip MC is still in fdec */
2725 /* Requires b-rdo to be done before intra analysis */
2726 h->mb.b_skip_mc = 1;
2727 x264_analyse_update_cache( h, a );
2728 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2729 h->mb.b_skip_mc = 0;
2732 //FIXME not all the update_cache calls are needed
2733 h->mb.i_partition = D_16x16;
2735 if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
2737 h->mb.i_type = B_L0_L0;
2738 x264_analyse_update_cache( h, a );
2739 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2743 if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
2745 h->mb.i_type = B_L1_L1;
2746 x264_analyse_update_cache( h, a );
2747 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2751 if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
2753 h->mb.i_type = B_BI_BI;
2754 x264_analyse_update_cache( h, a );
2755 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2759 if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
2761 h->mb.i_type = B_8x8;
2762 h->mb.i_partition = D_8x8;
2763 x264_analyse_update_cache( h, a );
2764 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2765 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2769 if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
2771 h->mb.i_type = a->i_mb_type16x8;
2772 h->mb.i_partition = D_16x8;
2773 x264_analyse_update_cache( h, a );
2774 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2778 if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
2780 h->mb.i_type = a->i_mb_type8x16;
2781 h->mb.i_partition = D_8x16;
2782 x264_analyse_update_cache( h, a );
2783 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2787 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2791 if( IS_INTRA(h->mb.i_type) )
2794 switch( h->mb.i_partition )
2797 if( h->mb.i_type == B_BI_BI )
2799 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2800 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2804 for( int i = 0; i < 2; i++ )
2805 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2807 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2808 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2812 for( int i = 0; i < 2; i++ )
2813 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2815 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2816 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2820 for( int i = 0; i < 4; i++ )
2821 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2823 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2824 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2830 static inline void x264_mb_analyse_transform( x264_t *h )
2832 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2834 /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
2837 int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
2838 int i_cost8 = 0, i_cost4 = 0;
2839 /* Not all platforms have a merged SATD function */
2840 if( h->pixf.sa8d_satd[PIXEL_16x16] )
2843 for( int p = 0; p < plane_count; p++ )
2845 cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2846 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2849 i_cost8 = (uint32_t)cost;
2850 i_cost4 = (uint32_t)(cost >> 32);
2854 for( int p = 0; p < plane_count; p++ )
2856 i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2857 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2858 i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2859 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2863 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2864 h->mb.b_skip_mc = 1;
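/* The merged sa8d_satd primitive returns both metrics in one 64-bit value,
 * SA8D in the low 32 bits and SATD in the high 32 bits, which is why the
 * accumulated cost is split with a cast and a shift above.  The 8x8 transform
 * is selected when the SA8D score (a proxy for 8x8-transform coding cost)
 * beats the 4x4 SATD score. */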
2868 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2870 if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
2872 uint32_t subpart_bak = M32( h->mb.i_sub_partition );
2873 /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
2874 if( h->mb.i_type == P_8x8 )
2875 M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
2876 else if( !x264_transform_allowed[h->mb.i_type] )
2879 x264_analyse_update_cache( h, a );
2880 h->mb.b_transform_8x8 ^= 1;
2881 /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
2882 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2884 if( *i_rd >= i_rd8 )
2886 if( *i_rd > 0 )
2887 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2888 *i_rd = i_rd8;
2890 else
2892 h->mb.b_transform_8x8 ^= 1;
2893 M32( h->mb.i_sub_partition ) = subpart_bak;
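/* If the toggled transform size scores no worse, it is kept and *i_satd is
 * rescaled by the ratio of the two RD costs so that callers comparing later
 * candidates against a SATD-derived threshold see a consistent value;
 * otherwise the toggle and the sub-partition change are reverted. */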
2898 /* Rate-distortion optimal QP selection.
2899 * FIXME: More than half of the benefit of this function seems to be
2900 * in the way it improves the coding of chroma DC (by decimating or
2901 * finding a better way to code a single DC coefficient.)
2902 * There must be a more efficient way to get that portion of the benefit
2903 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2904 * trick. */
2905 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
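/* Sketch of the search below: starting from the rate-control QP it walks the
 * QP in each direction (upward is skipped when the CBP is already zero),
 * evaluating x264_rd_cost_mb at every step; a small number of non-improving
 * steps is tolerated (more with psy-RD and when moving toward the previous
 * MB's QP), the previous MB's QP is always tried so a zero qp_delta stays
 * available, and the best (bqp, bcost) pair found is kept. */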
2907 int bcost, cost, failures, prevcost, origcost;
2908 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2909 int last_qp_tried = 0;
2910 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2911 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2913 /* If CBP is already zero, don't raise the quantizer any higher. */
2914 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2916 /* Without psy-RD, require monotonicity when moving quant away from previous
2917 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2918 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2919 * allow 2 failures when moving quant towards previous quant.
2920 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2921 int threshold = (!!h->mb.i_psy_rd);
2922 /* Raise the threshold for failures if we're moving towards the last QP. */
2923 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2924 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2926 h->mb.i_qp = orig_qp;
2928 prevcost = origcost;
2930 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2931 * (up to a point) will too. So, jump down to where the threshold will kick in
2932 * and check the QP there. If the CBP is still empty, skip the main loop.
2933 * If it isn't empty, we would have ended up having to check this QP anyways,
2934 * so as long as we store it for later lookup, we lose nothing. */
2935 int already_checked_qp = -1;
2936 int already_checked_cost = COST_MAX;
2937 if( direction == -1 )
2941 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
2942 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2943 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2944 if( !h->mb.cbp[h->mb.i_mb_xy] )
2946 /* If our empty-CBP block is lower QP than the last QP,
2947 * the last QP almost surely doesn't have a CBP either. */
2948 if( h->mb.i_last_qp > h->mb.i_qp )
2952 already_checked_qp = h->mb.i_qp;
2953 h->mb.i_qp = orig_qp;
2957 h->mb.i_qp += direction;
2958 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
2960 if( h->mb.i_last_qp == h->mb.i_qp )
2962 if( h->mb.i_qp == already_checked_qp )
2963 cost = already_checked_cost;
2966 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2967 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2968 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2971 /* We can't assume that the costs are monotonic over QPs.
2972 * Treating a tie as a failure seems to give better results. */
2973 if( cost < prevcost )
2979 if( failures > threshold )
2981 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2983 h->mb.i_qp += direction;
2987 /* Always try the last block's QP. */
2988 if( !last_qp_tried )
2990 h->mb.i_qp = h->mb.i_last_qp;
2991 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2992 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2993 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2997 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2999 /* Check transform again; decision from before may no longer be optimal. */
3000 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
3001 x264_mb_transform_8x8_allowed( h ) )
3003 h->mb.b_transform_8x8 ^= 1;
3004 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3005 if( cost > bcost )
3006 h->mb.b_transform_8x8 ^= 1;
3010 /*****************************************************************************
3011 * x264_macroblock_analyse:
3012 *****************************************************************************/
3013 void x264_macroblock_analyse( x264_t *h )
3015 x264_mb_analysis_t analysis;
3016 int i_cost = COST_MAX;
3018 h->mb.i_qp = x264_ratecontrol_mb_qp( h );
3019 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
3020 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
3021 if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
3022 h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
3024 if( h->param.analyse.b_mb_info )
3025 h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
3026 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
3028 /*--------------------------- Do the analysis ---------------------------*/
3029 if( h->sh.i_type == SLICE_TYPE_I )
3032 if( analysis.i_mbrd )
3033 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3034 x264_mb_analyse_intra( h, &analysis, COST_MAX );
3035 if( analysis.i_mbrd )
3036 x264_intra_rd( h, &analysis, COST_MAX );
3038 i_cost = analysis.i_satd_i16x16;
3039 h->mb.i_type = I_16x16;
3040 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
3041 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
3042 if( analysis.i_satd_pcm < i_cost )
3043 h->mb.i_type = I_PCM;
3045 else if( analysis.i_mbrd >= 2 )
3046 x264_intra_rd_refine( h, &analysis );
3048 else if( h->sh.i_type == SLICE_TYPE_P )
3052 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
3054 analysis.b_try_skip = 0;
3055 if( analysis.b_force_intra )
3057 if( !h->param.analyse.b_psy )
3059 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3060 goto intra_analysis;
3065 /* Special fast-skip logic using information from mb_info. */
3066 if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
3068 if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
3069 h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
3071 h->mb.i_partition = D_16x16;
3072 /* Use the P-SKIP MV if we can... */
3073 if( !M32(h->mb.cache.pskip_mv) )
3076 h->mb.i_type = P_SKIP;
3078 /* Otherwise, just force a 16x16 block. */
3081 h->mb.i_type = P_L0;
3082 analysis.l0.me16x16.i_ref = 0;
3083 M32( analysis.l0.me16x16.mv ) = 0;
3087 /* Reset the information accordingly */
3088 else if( h->param.analyse.b_mb_info_update )
3089 h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
3092 int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
3093 /* If the current macroblock is off the frame, just skip it. */
3094 if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
3096 /* Fast P_SKIP detection */
3097 else if( h->param.analyse.b_fast_pskip )
3100 // FIXME don't need to check this if the reference frame is done
3102 else if( h->param.analyse.i_subpel_refine >= 3 )
3103 analysis.b_try_skip = 1;
3104 else if( h->mb.i_mb_type_left[0] == P_SKIP ||
3105 h->mb.i_mb_type_top == P_SKIP ||
3106 h->mb.i_mb_type_topleft == P_SKIP ||
3107 h->mb.i_mb_type_topright == P_SKIP )
3108 b_skip = x264_macroblock_probe_pskip( h );
3112 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
3116 h->mb.i_type = P_SKIP;
3117 h->mb.i_partition = D_16x16;
3118 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
3120 /* Set up MVs for future predictors */
3121 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3122 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3126 const unsigned int flags = h->param.analyse.inter;
3129 int i_satd_inter, i_satd_intra;
3131 x264_mb_analyse_load_costs( h, &analysis );
3133 x264_mb_analyse_inter_p16x16( h, &analysis );
3135 if( h->mb.i_type == P_SKIP )
3137 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3138 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3142 if( flags & X264_ANALYSE_PSUB16x16 )
3144 if( h->param.analyse.b_mixed_references )
3145 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
3147 x264_mb_analyse_inter_p8x8( h, &analysis );
3150 /* Select best inter mode */
3152 i_partition = D_16x16;
3153 i_cost = analysis.l0.me16x16.cost;
3155 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3156 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
3159 i_partition = D_8x8;
3160 i_cost = analysis.l0.i_cost8x8;
3163 if( flags & X264_ANALYSE_PSUB8x8 )
3165 for( int i = 0; i < 4; i++ )
3167 x264_mb_analyse_inter_p4x4( h, &analysis, i );
3168 int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
3169 if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
3171 int i_cost8x8 = analysis.l0.i_cost4x4[i];
3172 h->mb.i_sub_partition[i] = D_L0_4x4;
3174 x264_mb_analyse_inter_p8x4( h, &analysis, i );
3175 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
3176 h->mb.i_sub_partition[i], D_L0_8x4 );
3178 x264_mb_analyse_inter_p4x8( h, &analysis, i );
3179 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
3180 h->mb.i_sub_partition[i], D_L0_4x8 );
3182 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
3184 x264_mb_cache_mv_p8x8( h, &analysis, i );
3186 analysis.l0.i_cost8x8 = i_cost;
3190 /* Now do 16x8/8x16 */
3191 int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
3192 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3193 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
3195 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
3196 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3197 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3199 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
3200 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
3202 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
3203 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3204 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3206 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
3207 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
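/* The estimates above drive the early termination inside the 16x8/8x16
 * searches: the not-yet-searched half's cost is approximated by the SATD of
 * its two constituent 8x8 blocks (i_satd8x8[0][..]) plus the average of their
 * mv/ref bit costs. */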
3210 h->mb.i_partition = i_partition;
3213 //FIXME mb_type costs?
3214 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3218 else if( i_partition == D_16x16 )
3220 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3221 i_cost = analysis.l0.me16x16.cost;
3223 else if( i_partition == D_16x8 )
3225 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
3226 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
3227 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
3229 else if( i_partition == D_8x16 )
3231 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
3232 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
3233 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
3235 else if( i_partition == D_8x8 )
3238 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3240 switch( h->mb.i_sub_partition[i8x8] )
3243 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
3244 i_cost += analysis.l0.me8x8[i8x8].cost;
3247 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
3248 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
3249 i_cost += analysis.l0.me8x4[i8x8][0].cost +
3250 analysis.l0.me8x4[i8x8][1].cost;
3253 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
3254 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
3255 i_cost += analysis.l0.me4x8[i8x8][0].cost +
3256 analysis.l0.me4x8[i8x8][1].cost;
3260 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
3261 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
3262 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
3263 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
3264 i_cost += analysis.l0.me4x4[i8x8][0].cost +
3265 analysis.l0.me4x4[i8x8][1].cost +
3266 analysis.l0.me4x4[i8x8][2].cost +
3267 analysis.l0.me4x4[i8x8][3].cost;
3270 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
3276 if( h->mb.b_chroma_me )
3280 x264_mb_analyse_intra( h, &analysis, i_cost );
3281 x264_mb_analyse_intra_chroma( h, &analysis );
3285 x264_mb_analyse_intra_chroma( h, &analysis );
3286 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
3288 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3289 analysis.i_satd_i8x8 += analysis.i_satd_chroma;
3290 analysis.i_satd_i4x4 += analysis.i_satd_chroma;
3293 x264_mb_analyse_intra( h, &analysis, i_cost );
3295 i_satd_inter = i_cost;
3296 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
3297 analysis.i_satd_i8x8,
3298 analysis.i_satd_i4x4 );
3300 if( analysis.i_mbrd )
3302 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
3304 i_partition = D_16x16;
3305 i_cost = analysis.l0.i_rd16x16;
3306 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
3307 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
3308 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
3309 h->mb.i_type = i_type;
3310 h->mb.i_partition = i_partition;
3311 if( i_cost < COST_MAX )
3312 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3313 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
3316 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3317 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3318 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3319 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3321 h->mb.i_type = i_type;
3323 if( analysis.b_force_intra && !IS_INTRA(i_type) )
3325 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
3326 * it were an inter block. */
3327 x264_analyse_update_cache( h, &analysis );
3328 x264_macroblock_encode( h );
3329 for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
3330 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
3333 int height = 16 >> CHROMA_V_SHIFT;
3334 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
3335 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
            x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
            goto intra_analysis;
        }

        if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
        {
            if( IS_INTRA( h->mb.i_type ) )
            {
                x264_intra_rd_refine( h, &analysis );
            }
            else if( i_partition == D_16x16 )
            {
                x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                analysis.l0.me16x16.cost = i_cost;
                x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
            }
            else if( i_partition == D_16x8 )
            {
                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
            }
            else if( i_partition == D_8x16 )
            {
                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
            }
            else if( i_partition == D_8x8 )
            {
                x264_analyse_update_cache( h, &analysis );
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                    }
                    else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                    }
                    else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                    }
                    else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                    {
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                        x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
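                /* i_lambda2 is a fixed-point lambda^2 with 8 fractional bits (see the lambda2 table
                 * at the top of this file), so (6 * i_lambda2 + 128) >> 8 is just 6*lambda2 with
                 * rounding: roughly the RD cost of the ~6 bits a non-skipped CAVLC macroblock would
                 * need. If the SSD of the direct prediction is already below that, B_SKIP is taken
                 * as the winner without further analysis. */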
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
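            /* h->mb.mvr[list][ref] is the per-reference 16x16 MV map that later macroblocks read
             * as additional motion-vector predictor candidates; a skipped MB simply records (0,0)
             * for every reference in both lists. */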
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;

            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
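            /* If direct came within ~3% (33/32) of the best 16x16 mode, it is worth doing the
             * 16x16 RD comparison immediately: when the plain skip reconstruction beats every
             * refined 16x16 mode in RD cost, analysis can stop here and emit B_SKIP. */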
            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;

                    /* 16x8 */
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;

                    /* 8x16 */
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
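                /* The 16x8/8x16 estimates above reuse the 8x8 SATD scores: each half is
                 * approximated by the sum of its two 8x8 SATDs plus the *average* of their MV/ref
                 * costs, since a 16x8 or 8x16 half carries one motion vector where the two 8x8
                 * blocks carried two. The lambda-weighted mb_type cost of the implied B partition
                 * type is then added on top of the second half. */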
                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }

            /* refine qpel */
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }

            i_satd_inter = i_cost;

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8 += analysis.i_satd_chroma;
                analysis.i_satd_i4x4 += analysis.i_satd_chroma;
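                /* With chroma ME the inter costs measured above already include chroma distortion,
                 * so the chroma SATD of the chosen intra chroma mode is added to each intra luma
                 * cost to keep the intra/inter comparison on an equal footing. */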
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
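                /* Intra is only RD-tested within a 17/16 margin of the best inter cost here, a
                 * tighter window than the 5/4 used in the P-slice path above; intra rarely wins in
                 * B frames, so the narrower margin is presumably there to save analysis time. */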
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }

    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
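    /* check_mv_lists maps the only types worth checking to (list+1): P_L0 and B_L0_L0 use list 0,
     * B_L1_L1 uses list 1, and every other type stays 0 and is skipped. 4x4 blocks 0 and 12 sit in
     * the top-left and bottom-right quadrants, so they belong to different halves of either a 16x8
     * or an 8x16 split; if they still agree in MV and reference, the split carries no information
     * and can be collapsed back to a 16x16 partition. */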
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
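/* Copies the chosen mode's side information (intra prediction modes, references, motion vectors)
 * from the analysis context into h->mb.cache, where the encoding pass and the MV/mode prediction
 * of subsequent macroblocks expect to find it. */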
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

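    /* Sanity check for frame-parallel encoding: a reference frame is only guaranteed to be
     * available up to its i_lines_completed row, so a motion vector reaching below that row would
     * read pixels another thread has not produced yet. Analysis is supposed to clamp the ME range
     * so this never triggers; if it does, the macroblock is recoded as I_16x16 as a recovery. */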
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
}

#include "slicetype.c"