1 /*****************************************************************************
2 * analyse.c: macroblock analysis
3 *****************************************************************************
4 * Copyright (C) 2003-2015 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
28 #define _ISOC99_SOURCE
30 #include "common/common.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
42 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
74 /* conduct the analysis using this lambda and QP */
79 uint16_t *p_cost_ref[2];
84 /* Take some shortcuts in intra search if intra is deemed unlikely */
86 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
87 int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
92 int i_satd_i16x16_dir[7];
97 ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
101 int i_predict4x4[16];
107 int i_satd_chroma_dir[7];
108 int i_predict8x8chroma;
110 /* II: Inter part P/B frame */
111 x264_mb_analysis_list_t l0;
112 x264_mb_analysis_list_t l1;
114 int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
115 int i_cost16x16direct;
117 int i_cost8x8direct[4];
118 int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
119 int i_cost_est16x8[2]; /* Per-partition estimated cost */
120 int i_cost_est8x16[2];
129 int i_mb_partition16x8[2]; /* mb_partition_e */
130 int i_mb_partition8x16[2];
131 int i_mb_type16x8; /* mb_class_e */
134 int b_direct_available;
135 int b_early_terminate;
137 } x264_mb_analysis_t;
139 /* lambda = pow(2,qp/6-2) */
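/* e.g. QP 34: pow(2, 34/6 - 2) = 12.7, stored as 13 below */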
140 const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
142 1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */
143 1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */
144 2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */
145 4, 4, 5, 6, 6, 7, 8, 9, /* 24-31 */
146 10, 11, 13, 14, 16, 18, 20, 23, /* 32-39 */
147 25, 29, 32, 36, 40, 45, 51, 57, /* 40-47 */
148 64, 72, 81, 91, 102, 114, 128, 144, /* 48-55 */
149 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
150 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
151 1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
152 2580,2896, /* 80-81 */
155 /* lambda2 = pow(lambda,2) * .9 * 256 */
156 /* Capped to avoid overflow */
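/* e.g. QP 0: (2^-2)^2 * .9 * 256 = 14.4 -> 14; the cap is 2^27 - 1 */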
157 const int x264_lambda2_tab[QP_MAX_MAX+1] =
159 14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */
160 91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */
161 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */
162 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24-31 */
163 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32-39 */
164 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40-47 */
165 943718, 1189010, 1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
166 5992238, 7549747, 9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
167 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */
168 134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
169 134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
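/* Fractional part of a fixed-point exp2: x264_exp2_lut[i] ~ (2^(i/64) - 1) * 256. */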
172 const uint8_t x264_exp2_lut[64] =
174 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
175 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
176 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
177 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
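/* x264_log2_lut[i] = log2(1 + i/128), the fractional counterpart to the lz table below. */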
180 const float x264_log2_lut[128] =
182 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
183 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
184 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
185 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
186 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
187 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
188 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
189 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
190 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
191 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
192 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
193 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
194 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
195 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
196 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
197 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
200 /* Avoid an int/float conversion. */
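/* Entry lz holds 31 - lz, i.e. the integer part of a table-driven log2 indexed by the leading-zero count. */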
201 const float x264_log2_lz_lut[32] =
203 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
206 // should the intra and inter lambdas be different?
207 // I'm just matching the behaviour of deadzone quant.
208 static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
210 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
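// (QP 0: .85*.85 * 2^6 = 46.2 -> 46, the first entry, which implies LAMBDA_BITS == 4 here)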
212 46, 58, 73, 92, 117, 147,
213 185, 233, 294, 370, 466, 587,
214 740, 932, 1174, 1480, 1864, 2349,
215 2959, 3728, 4697, 5918, 7457, 9395,
216 11837, 14914, 18790, 23674, 29828, 37581,
217 47349, 59656, 75163, 94699, 119313, 150326,
218 189399, 238627, 300652, 378798, 477255, 601304,
219 757596, 954511, 1202608, 1515192, 1909022, 2405217,
220 3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
221 12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
222 48486154, 61088726, 76966972, 96972308,
223 122177453,134217727,134217727,134217727,134217727,134217727,
224 134217727,134217727,134217727,134217727,134217727,134217727,
226 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
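// (QP 0: .65*.65 * 2^6 = 27.0 -> 27, the first entry)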
228 27, 34, 43, 54, 68, 86,
229 108, 136, 172, 216, 273, 343,
230 433, 545, 687, 865, 1090, 1374,
231 1731, 2180, 2747, 3461, 4361, 5494,
232 6922, 8721, 10988, 13844, 17442, 21976,
233 27688, 34885, 43953, 55377, 69771, 87906,
234 110755, 139543, 175813, 221511, 279087, 351627,
235 443023, 558174, 703255, 886046, 1116348, 1406511,
236 1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
237 7088374, 8930791, 11252092, 14176748, 17861583, 22504184,
238 28353495, 35723165, 45008368, 56706990,
239 71446330, 90016736,113413980,134217727,134217727,134217727,
240 134217727,134217727,134217727,134217727,134217727,134217727,
241 134217727,134217727,134217727,134217727,134217727,134217727,
245 #define MAX_CHROMA_LAMBDA_OFFSET 36
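/* Roughly 256 * 2^((idx-12)/3); idx 12 (chroma QP equal to luma QP) is the neutral offset of 256. */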
246 static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
248 16, 20, 25, 32, 40, 50,
249 64, 80, 101, 128, 161, 203,
250 256, 322, 406, 512, 645, 812,
251 1024, 1290, 1625, 2048, 2580, 3250,
252 4096, 5160, 6501, 8192, 10321, 13003,
253 16384, 20642, 26007, 32768, 41285, 52015,
257 /* TODO: calculate CABAC costs */
258 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
260 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
262 static const uint8_t i_mb_b16x8_cost_table[17] =
264 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
266 static const uint8_t i_sub_mb_b_cost_table[13] =
268 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
270 static const uint8_t i_sub_mb_p_cost_table[4] =
275 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
277 static uint16_t x264_cost_ref[QP_MAX+1][3][33];
278 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
279 static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
281 float *x264_analyse_prepare_costs( x264_t *h )
283 float *logs = x264_malloc( (2*4*2048+1)*sizeof(float) );
287 for( int i = 1; i <= 2*4*2048; i++ )
288 logs[i] = log2f(i+1)*2 + 1.718f;
292 int x264_analyse_init_costs( x264_t *h, float *logs, int qp )
294 int lambda = x264_lambda_tab[qp];
297 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
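/* The pointer is recentered below so cost_mv[qp] can be indexed by signed qpel deltas in [-2*4*2048, 2*4*2048]. */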
298 CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
299 h->cost_mv[qp] += 2*4*2048;
300 for( int i = 0; i <= 2*4*2048; i++ )
302 h->cost_mv[qp][-i] =
303 h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
305 x264_pthread_mutex_lock( &cost_ref_mutex );
306 for( int i = 0; i < 3; i++ )
307 for( int j = 0; j < 33; j++ )
308 x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
309 x264_pthread_mutex_unlock( &cost_ref_mutex );
310 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
312 for( int j = 0; j < 4; j++ )
314 CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
315 h->cost_mv_fpel[qp][j] += 2*2048;
316 for( int i = -2*2048; i < 2*2048; i++ )
317 h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
320 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
321 for( int i = 0; i < 17; i++ )
322 cost_i4x4_mode[i] = 3*lambda*(i!=8);
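/* Entry 8 (mode == predicted mode after the -i_pred_mode+8 recentering used by callers) is free; any other mode costs ~3 bits. */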
328 void x264_analyse_free_costs( x264_t *h )
330 for( int i = 0; i < QP_MAX+1; i++ )
333 x264_free( h->cost_mv[i] - 2*4*2048 );
334 if( h->cost_mv_fpel[i][0] )
335 for( int j = 0; j < 4; j++ )
336 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
340 void x264_analyse_weight_frame( x264_t *h, int end )
342 for( int j = 0; j < h->i_ref[0]; j++ )
344 if( h->sh.weight[j][0].weightfn )
346 x264_frame_t *frame = h->fref[0][j];
347 int width = frame->i_width[0] + 2*PADH;
348 int i_padv = PADV << PARAM_INTERLACED;
349 int offset, height;
350 pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
351 height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
352 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
353 h->fenc->i_lines_weighted += height;
355 for( int k = j; k < h->i_ref[0]; k++ )
356 if( h->sh.weight[k][0].weightfn )
358 pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
359 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
360 src + offset, frame->i_stride[0],
361 width, height, &h->sh.weight[k][0] );
368 /* initialize an array of lambda*nbits for all possible mvs */
369 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
371 a->p_cost_mv = h->cost_mv[a->i_qp];
372 a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
373 a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
376 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
378 int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
379 a->i_lambda = x264_lambda_tab[qp];
380 a->i_lambda2 = x264_lambda2_tab[qp];
382 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
383 if( h->param.analyse.i_trellis )
385 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
386 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
387 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
388 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
390 h->mb.i_psy_rd_lambda = a->i_lambda;
391 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
392 int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
393 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
395 if( qp > QP_MAX_SPEC )
397 h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
398 h->nr_residual_sum = h->nr_residual_sum_buf[1];
399 h->nr_count = h->nr_count_buf[1];
400 h->mb.b_noise_reduction = 1;
401 qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
405 h->nr_offset = h->nr_offset_denoise;
406 h->nr_residual_sum = h->nr_residual_sum_buf[0];
407 h->nr_count = h->nr_count_buf[0];
408 h->mb.b_noise_reduction = 0;
411 a->i_qp = h->mb.i_qp = qp;
412 h->mb.i_chroma_qp = h->chroma_qp_table[qp];
415 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
417 int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
419 /* mbrd == 1 -> RD mode decision */
420 /* mbrd == 2 -> RD refinement */
421 /* mbrd == 3 -> QPRD */
422 a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
423 h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
424 a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;
426 x264_mb_analyse_init_qp( h, a, qp );
428 h->mb.b_transform_8x8 = 0;
434 a->i_satd_chroma = COST_MAX;
436 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
437 * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
438 uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
439 a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;
442 a->b_avoid_topright = 0;
443 h->mb.i_skip_intra =
444 h->mb.b_lossless ? 0 :
445 a->i_mbrd ? 2 :
446 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
448 /* II: Inter part P/B frame */
449 if( h->sh.i_type != SLICE_TYPE_I )
451 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
452 // limit motion search to a slightly smaller range than the theoretical limit,
453 // since the search may go a few iterations past its given range
454 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
456 /* Calculate max allowed MV range */
457 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
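/* e.g. i_mv_range = 512 gives i_fmv_range = 2048 qpel units, so the spel limits clip to [-2048, 2047] */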
458 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
459 h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
460 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
461 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
462 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
464 int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
465 int max_mv = max_x - 4*16*h->mb.i_mb_x;
466 /* If we're left of the refresh bar, don't reference right of it. */
467 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
468 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
470 h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
471 h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
472 if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
474 int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
475 int thread_mvy_range = i_fmv_range;
477 if( h->i_thread_frames > 1 )
479 int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
480 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
481 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
482 for( int j = 0; j < h->i_ref[i]; j++ )
484 x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
485 thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
488 if( h->param.b_deterministic )
489 thread_mvy_range = h->param.analyse.i_mv_range_thread;
490 if( PARAM_INTERLACED )
491 thread_mvy_range >>= 1;
493 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
496 if( PARAM_INTERLACED )
498 /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
499 for( int i = 0; i < 3; i++ )
502 mb_y = (h->mb.i_mb_y >> j) + (i == 1);
503 h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
504 h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
505 h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
506 h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
507 h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
508 h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
509 h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
514 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
515 h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
516 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
517 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
518 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
519 h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
520 h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
523 if( PARAM_INTERLACED )
525 int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
526 h->mb.mv_min[1] = h->mb.mv_miny_row[i];
527 h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
528 h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
529 h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
530 h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
531 h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
539 a->l0.i_cost8x16 = COST_MAX;
540 if( h->sh.i_type == SLICE_TYPE_B )
545 a->i_cost8x8direct[0] =
546 a->i_cost8x8direct[1] =
547 a->i_cost8x8direct[2] =
548 a->i_cost8x8direct[3] =
557 a->i_cost16x16direct =
560 a->i_cost8x16bi = COST_MAX;
562 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
563 for( int i = 0; i < 4; i++ )
567 a->l0.i_cost4x8[i] = COST_MAX;
570 /* Fast intra decision */
571 if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
573 /* Always run in fast-intra mode for subme < 3 */
574 if( h->mb.i_subpel_refine > 2 &&
575 ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
576 IS_INTRA( h->mb.i_mb_type_top ) ||
577 IS_INTRA( h->mb.i_mb_type_topleft ) ||
578 IS_INTRA( h->mb.i_mb_type_topright ) ||
579 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
580 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
581 { /* intra is likely */ }
588 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
589 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
591 a->b_force_intra = 1;
593 a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
596 a->b_force_intra = 0;
600 /* Prediction modes allowed for various combinations of neighbors. */
601 /* Terminated by a -1. */
602 /* In order, no neighbors, left, top, top/left, top/left/topleft */
603 static const int8_t i16x16_mode_available[5][5] =
605 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
606 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
607 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
608 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
609 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
612 static const int8_t chroma_mode_available[5][5] =
614 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
615 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
616 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
617 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
618 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
621 static const int8_t i8x8_mode_available[2][5][10] =
624 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
625 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
626 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
627 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
628 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
631 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
632 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
633 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
634 {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
635 {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
639 static const int8_t i4x4_mode_available[2][5][10] =
642 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
643 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
644 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
645 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
646 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
649 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
650 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
651 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
652 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
653 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
657 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
659 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
660 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
661 return i16x16_mode_available[idx];
664 static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
666 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
667 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
668 return chroma_mode_available[idx];
671 static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
673 int avoid_topright = force_intra && (i&1);
674 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
675 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
676 return i8x8_mode_available[avoid_topright][idx];
679 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
681 int avoid_topright = force_intra && ((i&5) == 5);
682 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
683 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
684 return i4x4_mode_available[avoid_topright][idx];
687 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
688 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
690 ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
692 if( do_both_dct || h->mb.b_transform_8x8 )
693 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
694 if( do_both_dct || !h->mb.b_transform_8x8 )
695 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
698 /* Reset fenc satd scores cache for psy RD */
699 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
701 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
702 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
703 if( !h->mb.i_psy_rd )
705 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
706 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
708 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
711 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
713 if( a->i_satd_chroma < COST_MAX )
718 if( !h->mb.b_chroma_me )
720 a->i_satd_chroma = 0;
724 /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
725 if( h->mb.b_lossless )
727 x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
728 x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
732 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
733 h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
735 a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
736 + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
740 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
741 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
743 /* Prediction selection for chroma */
744 if( predict_mode[3] >= 0 && !h->mb.b_lossless )
746 int satdu[4], satdv[4];
747 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
748 h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
749 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
750 h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
751 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
752 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
754 for( ; *predict_mode >= 0; predict_mode++ )
756 int i_mode = *predict_mode;
757 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
759 a->i_satd_chroma_dir[i_mode] = i_satd;
760 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
765 for( ; *predict_mode >= 0; predict_mode++ )
768 int i_mode = *predict_mode;
770 /* we do the prediction */
771 if( h->mb.b_lossless )
772 x264_predict_lossless_chroma( h, i_mode );
775 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
776 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
779 /* we calculate the cost */
780 i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
781 h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
782 a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
784 a->i_satd_chroma_dir[i_mode] = i_satd;
785 COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
789 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
792 /* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
793 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
795 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
796 pixel *p_src = h->mb.pic.p_fenc[0];
797 pixel *p_dst = h->mb.pic.p_fdec[0];
798 static const int8_t intra_analysis_shortcut[2][2][2][5] =
800 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
801 {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
802 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
803 {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
804 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
805 {-1, -1, -1, -1, -1}},
806 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
807 {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
811 int lambda = a->i_lambda;
813 /*---------------- Try all modes and calculate their scores ---------------*/
814 /* Disabled i16x16 for AVC-Intra compat */
815 if( !h->param.i_avcintra_class )
817 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
819 /* Not heavily tuned */
820 static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
821 int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
823 if( !h->mb.b_lossless && predict_mode[3] >= 0 )
825 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
826 a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
827 a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
828 a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
829 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
830 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
831 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
833 /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
834 if( a->i_satd_i16x16 <= i16x16_thresh )
836 h->predict_16x16[I_PRED_16x16_P]( p_dst );
837 a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
838 a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
839 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
844 for( ; *predict_mode >= 0; predict_mode++ )
847 int i_mode = *predict_mode;
849 if( h->mb.b_lossless )
850 x264_predict_lossless_16x16( h, 0, i_mode );
852 h->predict_16x16[i_mode]( p_dst );
854 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
855 lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
856 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
857 a->i_satd_i16x16_dir[i_mode] = i_satd;
861 if( h->sh.i_type == SLICE_TYPE_B )
862 /* cavlc mb type prefix */
863 a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
865 if( a->i_satd_i16x16 > i16x16_thresh )
869 uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
870 /* 8x8 prediction selection */
871 if( flags & X264_ANALYSE_I8x8 )
873 ALIGNED_ARRAY_32( pixel, edge,[36] );
874 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
875 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
877 // FIXME some bias like in i4x4?
878 int i_cost = lambda * 4; /* base predmode costs */
879 h->mb.i_cbp_luma = 0;
881 if( h->sh.i_type == SLICE_TYPE_B )
882 i_cost += lambda * i_mb_b_cost_table[I_8x8];
884 for( idx = 0;; idx++ )
888 pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
889 pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
890 int i_best = COST_MAX;
891 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
893 const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
894 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
896 if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
898 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
899 i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
900 i_cost += i_best & 0xffff;
901 i_best >>= 16;
902 a->i_predict8x8[idx] = i_best;
903 if( idx == 3 || i_cost > i_satd_thresh )
905 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
909 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
911 ALIGNED_ARRAY_16( int32_t, satd,[9] );
912 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
913 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
914 satd[i_pred_mode] -= 3 * lambda;
915 for( int i = 2; i >= 0; i-- )
917 int cost = satd[i];
918 a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
919 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
922 /* Take analysis shortcuts: don't analyse modes that are too
923 * far away direction-wise from the favored mode. */
924 if( a->i_mbrd < 1 + a->b_fast_intra )
925 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
930 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
933 int i_mode = *predict_mode;
935 if( h->mb.b_lossless )
936 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
938 h->predict_8x8[i_mode]( p_dst_by, edge );
940 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
941 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
942 i_satd -= 3 * lambda;
944 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
945 a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
947 i_cost += i_best + 3*lambda;
949 if( idx == 3 || i_cost > i_satd_thresh )
951 if( h->mb.b_lossless )
952 x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
954 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
955 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
957 /* we need to encode this block now (for next ones) */
958 x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
963 a->i_satd_i8x8 = i_cost;
964 if( h->mb.i_skip_intra )
966 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
967 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
968 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
969 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
970 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
971 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
972 if( h->mb.i_skip_intra == 2 )
973 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
978 static const uint16_t cost_div_fix8[3] = {1024,512,341};
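/* 4/1, 4/2, 4/3 in .8 fixed point: extrapolate the cost of the 1-3 analysed 8x8 blocks to a whole-MB estimate */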
979 a->i_satd_i8x8 = COST_MAX;
980 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
982 /* Not heavily tuned */
983 static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
984 if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
988 /* 4x4 prediction selection */
989 if( flags & X264_ANALYSE_I4x4 )
991 int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
992 int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
993 h->mb.i_cbp_luma = 0;
995 if( a->b_early_terminate && a->i_mbrd )
996 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
998 if( h->sh.i_type == SLICE_TYPE_B )
999 i_cost += lambda * i_mb_b_cost_table[I_4x4];
1001 for( idx = 0;; idx++ )
1003 pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
1004 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1005 int i_best = COST_MAX;
1006 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
1008 const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1010 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1011 /* emulate missing topright samples */
1012 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
1014 if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
1016 /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
1017 i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
1018 i_cost += i_best & 0xffff;
1019 i_best >>= 16;
1020 a->i_predict4x4[idx] = i_best;
1021 if( i_cost > i_satd_thresh || idx == 15 )
1023 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
1027 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
1029 ALIGNED_ARRAY_16( int32_t, satd,[9] );
1030 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
1031 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
1032 satd[i_pred_mode] -= 3 * lambda;
1033 i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
1034 COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
1035 COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
1037 /* Take analysis shortcuts: don't analyse modes that are too
1038 * far away direction-wise from the favored mode. */
1039 if( a->i_mbrd < 1 + a->b_fast_intra )
1040 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
1047 for( ; *predict_mode >= 0; predict_mode++ )
1050 int i_mode = *predict_mode;
1052 if( h->mb.b_lossless )
1053 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
1055 h->predict_4x4[i_mode]( p_dst_by );
1057 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
1058 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
1060 i_satd -= lambda * 3;
1064 a->i_predict4x4[idx] = i_mode;
1069 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
1073 i_cost += i_best + 3 * lambda;
1074 if( i_cost > i_satd_thresh || idx == 15 )
1076 if( h->mb.b_lossless )
1077 x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
1079 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
1080 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1082 /* we need to encode this block now (for next ones) */
1083 x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
1087 a->i_satd_i4x4 = i_cost;
1088 if( h->mb.i_skip_intra )
1090 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
1091 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
1092 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1093 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
1094 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
1095 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
1096 if( h->mb.i_skip_intra == 2 )
1097 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
1101 a->i_satd_i4x4 = COST_MAX;
1105 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1107 if( !a->b_early_terminate )
1108 i_satd_thresh = COST_MAX;
1110 if( a->i_satd_i16x16 < i_satd_thresh )
1112 h->mb.i_type = I_16x16;
1113 x264_analyse_update_cache( h, a );
1114 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1117 a->i_satd_i16x16 = COST_MAX;
1119 if( a->i_satd_i4x4 < i_satd_thresh )
1121 h->mb.i_type = I_4x4;
1122 x264_analyse_update_cache( h, a );
1123 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
1126 a->i_satd_i4x4 = COST_MAX;
1128 if( a->i_satd_i8x8 < i_satd_thresh )
1130 h->mb.i_type = I_8x8;
1131 x264_analyse_update_cache( h, a );
1132 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1133 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1136 a->i_satd_i8x8 = COST_MAX;
1139 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1141 uint64_t i_satd, i_best;
1142 int plane_count = CHROMA444 ? 3 : 1;
1143 h->mb.i_skip_intra = 0;
1145 if( h->mb.i_type == I_16x16 )
1147 int old_pred_mode = a->i_predict16x16;
1148 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
1149 int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
1150 i_best = a->i_satd_i16x16;
1151 for( ; *predict_mode >= 0; predict_mode++ )
1153 int i_mode = *predict_mode;
1154 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1156 h->mb.i_intra16x16_pred_mode = i_mode;
1157 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1158 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1162 /* RD selection for chroma prediction */
1165 const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
1166 if( predict_mode[1] >= 0 )
1168 int8_t predict_mode_sorted[4];
1170 int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
1172 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
1174 int i_mode = *predict_mode;
1175 if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
1176 predict_mode_sorted[i_max++] = i_mode;
1181 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1182 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1183 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1184 * coefs for the current chroma mode are still around, so we only
1185 * have to recount the bits. */
1186 i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1187 for( int i = 0; i < i_max; i++ )
1189 int i_mode = predict_mode_sorted[i];
1190 if( h->mb.b_lossless )
1191 x264_predict_lossless_chroma( h, i_mode );
1194 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
1195 h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
1197 /* if we've already found a mode that needs no residual, then
1198 * probably any mode with a residual will be worse.
1199 * so avoid dct on the remaining modes to improve speed. */
1200 i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1201 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1203 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1204 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1209 if( h->mb.i_type == I_4x4 )
1211 pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
1212 int nnz[3] = {0};
1213 for( int idx = 0; idx < 16; idx++ )
1215 pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
1216 h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
1217 h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
1218 i_best = COST_MAX64;
1220 const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1222 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1223 for( int p = 0; p < plane_count; p++ )
1224 /* emulate missing topright samples */
1225 MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );
1227 for( ; *predict_mode >= 0; predict_mode++ )
1229 int i_mode = *predict_mode;
1230 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1232 if( i_best > i_satd )
1234 a->i_predict4x4[idx] = i_mode;
1236 for( int p = 0; p < plane_count; p++ )
1238 pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
1239 pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
1240 pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
1241 pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
1242 nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
1247 for( int p = 0; p < plane_count; p++ )
1249 MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
1250 MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
1251 MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
1252 MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
1253 h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
1256 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1259 else if( h->mb.i_type == I_8x8 )
1261 ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
1262 pixel4 pels_h[3][2] = {{0}};
1263 pixel pels_v[3][7] = {{0}};
1264 uint16_t nnz[3][2] = {{0}}; //shut up gcc
1265 for( int idx = 0; idx < 4; idx++ )
1269 int s8 = X264_SCAN8_0 + 2*x + 16*y;
1270 pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
1271 h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
1272 h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
1273 int cbp_luma_new = 0;
1274 int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;
1276 i_best = COST_MAX64;
1278 const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
1279 for( int p = 0; p < plane_count; p++ )
1280 h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1282 for( ; *predict_mode >= 0; predict_mode++ )
1284 int i_mode = *predict_mode;
1285 if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
1288 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1289 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );
1291 if( i_best > i_satd )
1293 a->i_predict8x8[idx] = i_mode;
1294 cbp_luma_new = h->mb.i_cbp_luma;
1297 for( int p = 0; p < plane_count; p++ )
1299 pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
1300 pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
1302 for( int j = 0; j < 7; j++ )
1303 pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
1304 nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
1305 nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
1309 a->i_cbp_i8x8_luma = cbp_luma_new;
1310 for( int p = 0; p < plane_count; p++ )
1312 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
1313 MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
1315 for( int j = 0; j < 7; j++ )
1316 dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
1317 M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
1318 M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
1321 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1326 #define LOAD_FENC(m, src, xoff, yoff) \
1328 (m)->p_cost_mv = a->p_cost_mv; \
1329 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1330 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1331 (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
1332 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1333 (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1334 (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
1337 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1339 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1340 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1341 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1342 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1345 (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
1346 (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
1347 (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
1348 (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
1349 (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
1350 (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
1351 (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
1352 (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
1355 (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
1356 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1357 (m)->weight = x264_weight_none; \
1361 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1362 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1363 (m)->weight = h->sh.weight[i_ref];
1365 #define REF_COST(list, ref) \
1366 (a->p_cost_ref[list][ref])
1368 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1372 ALIGNED_4( int16_t mvc[8][2] );
1373 int i_halfpel_thresh = INT_MAX;
1374 int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
1376 /* 16x16 search over all ref frames */
1377 m.i_pixel = PIXEL_16x16;
1378 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1380 a->l0.me16x16.cost = INT_MAX;
1381 for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1383 m.i_ref_cost = REF_COST( 0, i_ref );
1384 i_halfpel_thresh -= m.i_ref_cost;
1386 /* search with ref */
1387 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1388 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1390 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1392 if( h->mb.ref_blind_dupe == i_ref )
1394 CP32( m.mv, a->l0.mvc[0][0] );
1395 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1399 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1400 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1403 /* save mv for predicting neighbors */
1404 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1405 CP32( a->l0.mvc[i_ref][0], m.mv );
1407 /* early termination
1408 * SSD threshold would probably be better than SATD */
1411 && m.cost-m.cost_mv < 300*a->i_lambda
1412 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1413 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1414 && x264_macroblock_probe_pskip( h ) )
1416 h->mb.i_type = P_SKIP;
1417 x264_analyse_update_cache( h, a );
1418 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1422 m.cost += m.i_ref_cost;
1423 i_halfpel_thresh += m.i_ref_cost;
1425 if( m.cost < a->l0.me16x16.cost )
1426 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1429 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1430 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1432 h->mb.i_type = P_L0;
1435 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1436 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1438 h->mb.i_partition = D_16x16;
1439 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1440 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1441 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1442 h->mb.i_type = P_SKIP;
1447 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1450 pixel **p_fenc = h->mb.pic.p_fenc;
1451 int i_maxref = h->mb.pic.i_fref[0]-1;
1453 h->mb.i_partition = D_8x8;
1455 #define CHECK_NEIGHBOUR(i)\
1457 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1458 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1462 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1463 * than those used by the neighbors */
1464 if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1465 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
1468 CHECK_NEIGHBOUR( -8 - 1 );
1469 CHECK_NEIGHBOUR( -8 + 0 );
1470 CHECK_NEIGHBOUR( -8 + 2 );
1471 CHECK_NEIGHBOUR( -8 + 4 );
1472 CHECK_NEIGHBOUR( 0 - 1 );
1473 CHECK_NEIGHBOUR( 2*8 - 1 );
1475 #undef CHECK_NEIGHBOUR
1477 for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1478 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1480 for( int i = 0; i < 4; i++ )
1482 x264_me_t *l0m = &a->l0.me8x8[i];
1486 m.i_pixel = PIXEL_8x8;
1488 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1489 l0m->cost = INT_MAX;
1490 for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1492 m.i_ref_cost = REF_COST( 0, i_ref );
1494 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1495 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1497 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1498 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1499 if( h->mb.ref_blind_dupe == i_ref )
1501 CP32( m.mv, a->l0.mvc[0][i+1] );
1502 x264_me_refine_qpel_refdupe( h, &m, NULL );
1505 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1507 m.cost += m.i_ref_cost;
1509 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1511 if( m.cost < l0m->cost )
1512 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1513 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1514 i_ref = h->mb.ref_blind_dupe;
1518 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1519 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1521 a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1523 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1524 are effectively zero. */
1525 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1526 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1529 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1530 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1531 /* P_8x8 ref0 has no ref cost */
1532 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1533 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1534 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1535 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1536 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1539 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1541 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1542 * reference frame flags. Thus, if we're not doing mixedrefs, just
1543 * don't bother analysing the dupes. */
1544 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1545 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1546 pixel **p_fenc = h->mb.pic.p_fenc;
1548 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1550 /* XXX Needed for x264_mb_predict_mv */
1551 h->mb.i_partition = D_8x8;
1554 CP32( mvc[0], a->l0.me16x16.mv );
1556 for( int i = 0; i < 4; i++ )
1558 x264_me_t *m = &a->l0.me8x8[i];
1562 m->i_pixel = PIXEL_8x8;
1563 m->i_ref_cost = i_ref_cost;
1565 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1566 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1567 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1569 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1570 x264_me_search( h, m, mvc, i_mvc );
1572 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1574 CP32( mvc[i_mvc], m->mv );
1577 a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1580 m->cost += i_ref_cost;
1581 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1582 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1585 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1586 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1587 /* theoretically this should include 4*ref_cost,
1588 * but 3 seems a better approximation of cabac. */
1589 if( h->param.b_cabac )
1590 a->l0.i_cost8x8 -= i_ref_cost;
1591 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1592 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1595 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1598 pixel **p_fenc = h->mb.pic.p_fenc;
1599 ALIGNED_4( int16_t mvc[3][2] );
1601 /* XXX Needed for x264_mb_predict_mv */
1602 h->mb.i_partition = D_16x8;
1604 for( int i = 0; i < 2; i++ )
1606 x264_me_t *l0m = &a->l0.me16x8[i];
1607 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1608 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1609 const int ref8[2] = { minref, maxref };
1610 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1612 m.i_pixel = PIXEL_16x8;
1614 LOAD_FENC( &m, p_fenc, 0, 8*i );
1615 l0m->cost = INT_MAX;
1616 for( int j = 0; j < i_ref8s; j++ )
1618 const int i_ref = ref8[j];
1619 m.i_ref_cost = REF_COST( 0, i_ref );
1621 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1622 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1623 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1624 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1626 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1627 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1629 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1630 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1631 /* We can only take this shortcut if the first search was performed on ref0. */
1632 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1634 /* We can just leave the MV from the previous ref search. */
1635 x264_me_refine_qpel_refdupe( h, &m, NULL );
1638 x264_me_search( h, &m, mvc, 3 );
1640 m.cost += m.i_ref_cost;
1642 if( m.cost < l0m->cost )
1643 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1646 /* Early termination based on the current SATD score of partition[0]
1647 plus the estimated SATD score of partition[1] */
1648 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1650 a->l0.i_cost16x8 = COST_MAX;
1654 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1655 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1658 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1661 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1664 pixel **p_fenc = h->mb.pic.p_fenc;
1665 ALIGNED_4( int16_t mvc[3][2] );
1667 /* XXX Needed for x264_mb_predict_mv */
1668 h->mb.i_partition = D_8x16;
1670 for( int i = 0; i < 2; i++ )
1672 x264_me_t *l0m = &a->l0.me8x16[i];
1673 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1674 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1675 const int ref8[2] = { minref, maxref };
1676 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1678 m.i_pixel = PIXEL_8x16;
1680 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1681 l0m->cost = INT_MAX;
1682 for( int j = 0; j < i_ref8s; j++ )
1684 const int i_ref = ref8[j];
1685 m.i_ref_cost = REF_COST( 0, i_ref );
1687 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1688 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1689 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1691 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1692 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1694 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1695 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1696 /* We can only take this shortcut if the first search was performed on ref0. */
1697 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1699 /* We can just leave the MV from the previous ref search. */
1700 x264_me_refine_qpel_refdupe( h, &m, NULL );
1703 x264_me_search( h, &m, mvc, 3 );
1705 m.cost += m.i_ref_cost;
1707 if( m.cost < l0m->cost )
1708 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1711 /* Early termination based on the current SATD score of partition[0]
1712 plus the estimated SATD score of partition[1] */
1713 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1715 a->l0.i_cost8x16 = COST_MAX;
1719 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1720 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1723 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1726 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
1727 pixel **p_fref, int i8x8, int size, int chroma )
1729 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
1730 pixel *pix2 = pix1+8;
1731 int i_stride = h->mb.pic.i_stride[1];
1732 int chroma_h_shift = chroma <= CHROMA_422;
1733 int chroma_v_shift = chroma == CHROMA_420;
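/* (chroma_h_shift, chroma_v_shift) is (1,1) for 4:2:0, (1,0) for 4:2:2 and
 * (0,0) for 4:4:4, i.e. how much smaller the chroma plane is than luma in
 * each dimension. */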
1734 int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
1735 int i_ref = a->l0.me8x8[i8x8].i_ref;
1736 int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1737 x264_weight_t *weight = h->sh.weight[i_ref];
1739 // FIXME weight can be done on 4x4 blocks even if mc is smaller
1740 #define CHROMA4x4MC( width, height, me, x, y ) \
1741 if( chroma == CHROMA_444 ) \
1743 int mvx = (me).mv[0] + 4*2*x; \
1744 int mvy = (me).mv[1] + 4*2*y; \
1745 h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
1746 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
1747 h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
1748 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
1752 int offset = x + (2>>chroma_v_shift)*16*y; \
1753 int chroma_height = (2>>chroma_v_shift)*height; \
1754 h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
1755 (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
1756 if( weight[1].weightfn ) \
1757 weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
1758 if( weight[2].weightfn ) \
1759 weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
1762 if( size == PIXEL_4x4 )
1764 x264_me_t *m = a->l0.me4x4[i8x8];
1765 CHROMA4x4MC( 2,2, m[0], 0,0 );
1766 CHROMA4x4MC( 2,2, m[1], 2,0 );
1767 CHROMA4x4MC( 2,2, m[2], 0,2 );
1768 CHROMA4x4MC( 2,2, m[3], 2,2 );
1770 else if( size == PIXEL_8x4 )
1772 x264_me_t *m = a->l0.me8x4[i8x8];
1773 CHROMA4x4MC( 4,2, m[0], 0,0 );
1774 CHROMA4x4MC( 4,2, m[1], 0,2 );
1778 x264_me_t *m = a->l0.me4x8[i8x8];
1779 CHROMA4x4MC( 2,4, m[0], 0,0 );
1780 CHROMA4x4MC( 2,4, m[1], 2,0 );
1784 int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
1785 int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
1786 return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1787 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1790 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1792 if( CHROMA_FORMAT == CHROMA_444 )
1793 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
1794 else if( CHROMA_FORMAT == CHROMA_422 )
1795 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
1797 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
1800 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1802 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1803 pixel **p_fenc = h->mb.pic.p_fenc;
1804 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1806 /* XXX Needed for x264_mb_predict_mv */
1807 h->mb.i_partition = D_8x8;
1809 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1811 const int idx = 4*i8x8 + i4x4;
1812 const int x4 = block_idx_x[idx];
1813 const int y4 = block_idx_y[idx];
1814 const int i_mvc = (i4x4 == 0);
1816 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1818 m->i_pixel = PIXEL_4x4;
1820 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1821 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1822 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1824 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1825 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1827 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1829 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1830 a->l0.me4x4[i8x8][1].cost +
1831 a->l0.me4x4[i8x8][2].cost +
1832 a->l0.me4x4[i8x8][3].cost +
1833 REF_COST( 0, i_ref ) +
1834 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1835 if( h->mb.b_chroma_me )
1836 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1839 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1841 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1842 pixel **p_fenc = h->mb.pic.p_fenc;
1843 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1845 /* XXX Needed for x264_mb_predict_mv */
1846 h->mb.i_partition = D_8x8;
1848 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1850 const int idx = 4*i8x8 + 2*i8x4;
1851 const int x4 = block_idx_x[idx];
1852 const int y4 = block_idx_y[idx];
1853 const int i_mvc = (i8x4 == 0);
1855 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1857 m->i_pixel = PIXEL_8x4;
1859 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1860 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1861 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1863 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1864 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1866 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1868 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1869 REF_COST( 0, i_ref ) +
1870 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1871 if( h->mb.b_chroma_me )
1872 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1875 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1877 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1878 pixel **p_fenc = h->mb.pic.p_fenc;
1879 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1881 /* XXX Needed for x264_mb_predict_mv */
1882 h->mb.i_partition = D_8x8;
1884 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1886 const int idx = 4*i8x8 + i4x8;
1887 const int x4 = block_idx_x[idx];
1888 const int y4 = block_idx_y[idx];
1889 const int i_mvc = (i4x8 == 0);
1891 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1893 m->i_pixel = PIXEL_4x8;
1895 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1896 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1897 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1899 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1900 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1902 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1904 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1905 REF_COST( 0, i_ref ) +
1906 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1907 if( h->mb.b_chroma_me )
1908 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1911 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
1913 ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
1914 ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
1915 int i_chroma_cost = 0;
1916 int chromapix = h->luma2chroma_pixel[i_pixel];
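/* COST_BI_CHROMA fetches the chroma prediction for both lists (mc_luma on
 * the separate U/V planes for 4:4:4, mc_chroma on the interleaved plane
 * otherwise), averages them with the bipred weights, and scores the result
 * against the encoded chroma using the mbcmp metric. */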
1918 #define COST_BI_CHROMA( m0, m1, width, height ) \
1922 h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
1923 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1924 h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
1925 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1926 h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
1927 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1928 h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
1929 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1933 int v_shift = CHROMA_V_SHIFT; \
1934 int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1935 int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1936 h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
1937 m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1938 h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
1939 m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1941 h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1942 h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1943 i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
1944 + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
1947 if( i_pixel == PIXEL_16x16 )
1948 COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
1949 else if( i_pixel == PIXEL_16x8 )
1950 COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
1951 else if( i_pixel == PIXEL_8x16 )
1952 COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
1954 COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
1956 return i_chroma_cost;
1959 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1961 /* Assumes that fdec still contains the results of
1962 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
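/* Since fdec already holds the direct-mode prediction, the B_DIRECT cost can
 * be measured by comparing fenc against fdec directly; with BSUB16x16
 * analysis enabled it is accumulated per 8x8 block so the partial scores can
 * be reused as D_DIRECT_8x8 costs. */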
1964 pixel *p_fenc = h->mb.pic.p_fenc[0];
1965 pixel *p_fdec = h->mb.pic.p_fdec[0];
1967 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1968 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1970 int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
1972 for( int i = 0; i < 4; i++ )
1974 const int x = (i&1)*8;
1975 const int y = (i>>1)*8;
1976 a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
1977 &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1978 if( h->mb.b_chroma_me )
1980 int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
1981 int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
1982 a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
1983 &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
1984 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
1985 &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
1987 a->i_cost16x16direct += a->i_cost8x8direct[i];
1990 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1995 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
1996 if( h->mb.b_chroma_me )
1998 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
1999 a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
2000 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
2005 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
2007 ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
2008 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
2010 intptr_t stride0 = 16, stride1 = 16;
2012 ALIGNED_4( int16_t mvc[9][2] );
2013 int try_skip = a->b_try_skip;
2014 int list1_skipped = 0;
2015 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
2016 int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
2017 (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};
2020 m.i_pixel = PIXEL_16x16;
2022 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
2024 /* 16x16 Search on list 0 and list 1 */
2025 a->l0.me16x16.cost = INT_MAX;
2026 a->l1.me16x16.cost = INT_MAX;
2027 for( int l = 1; l >= 0; )
2029 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2031 /* This loop is extremely munged in order to facilitate the following order of operations,
2032 * necessary for an efficient fast skip.
2033 * 1. Search list1 ref0.
2034 * 2. Search list0 ref0.
2035 * 3. Try skip.
2036 * 4. Search the rest of list0.
2037 * 5. Go back and finish list1.
2039 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
2041 if( try_skip && l == 1 && i_ref > 0 )
2047 m.i_ref_cost = REF_COST( l, i_ref );
2049 /* search with ref */
2050 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
2051 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
2052 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
2053 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
2056 m.cost += m.i_ref_cost;
2058 if( m.cost < lX->me16x16.cost )
2059 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
2061 /* save mv for predicting neighbors */
2062 CP32( lX->mvc[i_ref][0], m.mv );
2063 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
2065 /* Fast skip detection. */
2066 if( i_ref == 0 && try_skip )
2068 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
2069 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
2075 /* We already tested skip */
2076 h->mb.i_type = B_SKIP;
2077 x264_analyse_update_cache( h, a );
2082 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
2084 if( list1_skipped && l == 0 )
2090 /* get cost of BI mode */
2091 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
2092 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
2093 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
2094 src0 = h->mc.get_ref( pix0, &stride0,
2095 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
2096 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
2097 src1 = h->mc.get_ref( pix1, &stride1,
2098 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
2099 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );
2101 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2103 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2105 + a->l0.bi16x16.cost_mv
2106 + a->l1.bi16x16.cost_mv;
2108 if( h->mb.b_chroma_me )
2109 a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
2111 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
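/* Only evaluated when at least one of the two chosen MVs is nonzero; with
 * both MVs at zero the prediction is just the average of the two fullpel
 * reference planes, so no interpolation is needed here. */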
2112 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
2114 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
2115 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
2116 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
2117 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
2118 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2119 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2120 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2121 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2122 + ref_costs + l0_mv_cost + l1_mv_cost;
2124 if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
2126 ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
2130 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2131 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2132 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2133 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
2134 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2135 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2136 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2137 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
2141 ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
2142 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2143 int v_shift = CHROMA_V_SHIFT;
2145 if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
2147 int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2148 h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2149 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
2152 h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2153 h->mb.pic.i_stride[1], 16>>v_shift );
2155 if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
2157 int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2158 h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2159 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
2162 h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2163 h->mb.pic.i_stride[1], 16>>v_shift );
2165 h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
2166 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2167 h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
2168 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2170 cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
2171 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
2175 if( cost00 < a->i_cost16x16bi )
2177 M32( a->l0.bi16x16.mv ) = 0;
2178 M32( a->l1.bi16x16.mv ) = 0;
2179 a->l0.bi16x16.cost_mv = l0_mv_cost;
2180 a->l1.bi16x16.cost_mv = l1_mv_cost;
2181 a->i_cost16x16bi = cost00;
2186 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
2187 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
2188 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
2191 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
2196 switch( h->mb.i_sub_partition[i] )
2199 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
2202 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
2203 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
2206 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
2207 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
2210 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
2211 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
2212 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
2213 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
2216 x264_log( h, X264_LOG_ERROR, "internal error\n" );
2221 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
2225 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
2226 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
2227 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
2228 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
2231 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
2232 if( x264_mb_partition_listX_table[0][part] ) \
2234 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
2235 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
2239 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
2240 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
2242 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
2244 if( x264_mb_partition_listX_table[1][part] ) \
2246 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
2247 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
2251 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
2252 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
2254 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
2257 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2261 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
2263 x264_mb_load_mv_direct8x8( h, i );
2266 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
2267 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
2268 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
2273 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
2276 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2278 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
2280 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2282 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
2286 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
2288 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2289 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
2291 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
2292 * than those used by the neighbors */
2293 #define CHECK_NEIGHBOUR(i)\
2295 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
2296 if( ref > i_maxref[l] )\
2300 for( int l = 0; l < 2; l++ )
2302 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2303 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
2304 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
2307 CHECK_NEIGHBOUR( -8 - 1 );
2308 CHECK_NEIGHBOUR( -8 + 0 );
2309 CHECK_NEIGHBOUR( -8 + 2 );
2310 CHECK_NEIGHBOUR( -8 + 4 );
2311 CHECK_NEIGHBOUR( 0 - 1 );
2312 CHECK_NEIGHBOUR( 2*8 - 1 );
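/* The offsets above are relative to scan8[0] in the 8-entry-wide ref cache:
 * -8-1 is the top-left neighbour, -8+0 and -8+2 the blocks above, -8+4 the
 * top-right neighbour, and 0-1 / 2*8-1 the two blocks to the left. */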
2316 /* XXX Needed for x264_mb_predict_mv */
2317 h->mb.i_partition = D_8x8;
2321 for( int i = 0; i < 4; i++ )
2327 intptr_t stride[2] = {8,8};
2330 m.i_pixel = PIXEL_8x8;
2331 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2333 for( int l = 0; l < 2; l++ )
2335 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2337 lX->me8x8[i].cost = INT_MAX;
2338 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
2340 m.i_ref_cost = REF_COST( l, i_ref );
2342 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
2344 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
2345 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2346 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
2347 m.cost += m.i_ref_cost;
2349 if( m.cost < lX->me8x8[i].cost )
2351 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
2352 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
2355 /* save mv for predicting other partitions within this MB */
2356 CP32( lX->mvc[i_ref][i+1], m.mv );
2361 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
2362 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
2363 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
2364 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
2365 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
2366 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
2368 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2369 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
2370 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
2371 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2373 if( h->mb.b_chroma_me )
2375 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2376 i_part_cost_bi += i_chroma_cost;
2377 a->i_satd8x8[2][i] += i_chroma_cost;
2380 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2381 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2383 i_part_cost = a->l0.me8x8[i].cost;
2384 h->mb.i_sub_partition[i] = D_L0_8x8;
2385 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2386 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2387 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2388 a->i_cost8x8bi += i_part_cost;
2390 /* XXX Needed for x264_mb_predict_mv */
2391 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2395 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2398 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2401 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2402 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2403 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2405 /* XXX Needed for x264_mb_predict_mv */
2406 h->mb.i_partition = D_8x8;
2410 for( int i = 0; i < 4; i++ )
2415 int i_part_cost_bi = 0;
2416 intptr_t stride[2] = {8,8};
2419 for( int l = 0; l < 2; l++ )
2421 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2422 x264_me_t *m = &lX->me8x8[i];
2423 m->i_pixel = PIXEL_8x8;
2424 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2426 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2427 m->i_ref = lX->me16x16.i_ref;
2429 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2431 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2432 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2433 x264_me_search( h, m, &lX->me16x16.mv, 1 );
2434 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2435 m->cost += m->i_ref_cost;
2437 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2439 /* save mv for predicting other partitions within this MB */
2440 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2443 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2444 m->mv[0], m->mv[1], 8, 8, x264_weight_none );
2445 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2447 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2448 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2449 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2450 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2451 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2453 if( h->mb.b_chroma_me )
2455 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2456 i_part_cost_bi += i_chroma_cost;
2457 a->i_satd8x8[2][i] += i_chroma_cost;
2460 i_part_cost = a->l0.me8x8[i].cost;
2461 h->mb.i_sub_partition[i] = D_L0_8x8;
2462 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2463 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2464 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2465 a->i_cost8x8bi += i_part_cost;
2467 /* XXX Needed for x264_mb_predict_mv */
2468 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2472 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2475 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2477 ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
2478 ALIGNED_4( int16_t mvc[3][2] );
2480 h->mb.i_partition = D_16x8;
2481 a->i_cost16x8bi = 0;
2483 for( int i = 0; i < 2; i++ )
2486 int i_part_cost_bi = 0;
2487 intptr_t stride[2] = {16,16};
2490 m.i_pixel = PIXEL_16x8;
2491 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2493 for( int l = 0; l < 2; l++ )
2495 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2496 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2497 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2498 lX->me16x8[i].cost = INT_MAX;
2499 for( int j = 0; j < i_ref8s; j++ )
2501 int i_ref = ref8[j];
2502 m.i_ref_cost = REF_COST( l, i_ref );
2504 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2506 CP32( mvc[0], lX->mvc[i_ref][0] );
2507 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2508 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2510 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2511 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2512 x264_me_search( h, &m, mvc, 3 );
2513 m.cost += m.i_ref_cost;
2515 if( m.cost < lX->me16x8[i].cost )
2516 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2521 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2522 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
2523 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2524 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
2525 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2526 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2528 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2529 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2530 + a->l1.me16x8[i].i_ref_cost;
2532 if( h->mb.b_chroma_me )
2533 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
2535 i_part_cost = a->l0.me16x8[i].cost;
2536 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2538 if( a->l1.me16x8[i].cost < i_part_cost )
2540 i_part_cost = a->l1.me16x8[i].cost;
2541 a->i_mb_partition16x8[i] = D_L1_8x8;
2543 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2545 i_part_cost = i_part_cost_bi;
2546 a->i_mb_partition16x8[i] = D_BI_8x8;
2548 a->i_cost16x8bi += i_part_cost;
2550 /* Early termination based on the current SATD score of partition[0]
2551 plus the estimated SATD score of partition[1] */
2552 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2553 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2555 a->i_cost16x8bi = COST_MAX;
2559 x264_mb_cache_mv_b16x8( h, a, i, 0 );
2563 a->i_mb_type16x8 = B_L0_L0
2564 + (a->i_mb_partition16x8[0]>>2) * 3
2565 + (a->i_mb_partition16x8[1]>>2);
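/* i_mb_partition>>2 maps D_L0_8x8/D_L1_8x8/D_BI_8x8 to 0/1/2, so this indexes
 * the 3x3 grid of B_<top>_<bottom> macroblock types that starts at B_L0_L0. */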
2566 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2569 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2571 ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
2572 ALIGNED_4( int16_t mvc[3][2] );
2574 h->mb.i_partition = D_8x16;
2575 a->i_cost8x16bi = 0;
2577 for( int i = 0; i < 2; i++ )
2580 int i_part_cost_bi = 0;
2581 intptr_t stride[2] = {8,8};
2584 m.i_pixel = PIXEL_8x16;
2585 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2587 for( int l = 0; l < 2; l++ )
2589 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2590 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2591 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2592 lX->me8x16[i].cost = INT_MAX;
2593 for( int j = 0; j < i_ref8s; j++ )
2595 int i_ref = ref8[j];
2596 m.i_ref_cost = REF_COST( l, i_ref );
2598 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2600 CP32( mvc[0], lX->mvc[i_ref][0] );
2601 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2602 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2604 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2605 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2606 x264_me_search( h, &m, mvc, 3 );
2607 m.cost += m.i_ref_cost;
2609 if( m.cost < lX->me8x16[i].cost )
2610 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2615 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2616 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
2617 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2618 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
2619 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2621 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2622 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2623 + a->l1.me8x16[i].i_ref_cost;
2625 if( h->mb.b_chroma_me )
2626 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
2628 i_part_cost = a->l0.me8x16[i].cost;
2629 a->i_mb_partition8x16[i] = D_L0_8x8;
2631 if( a->l1.me8x16[i].cost < i_part_cost )
2633 i_part_cost = a->l1.me8x16[i].cost;
2634 a->i_mb_partition8x16[i] = D_L1_8x8;
2636 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2638 i_part_cost = i_part_cost_bi;
2639 a->i_mb_partition8x16[i] = D_BI_8x8;
2641 a->i_cost8x16bi += i_part_cost;
2643 /* Early termination based on the current SATD score of partition[0]
2644 plus the estimated SATD score of partition[1] */
2645 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2646 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2648 a->i_cost8x16bi = COST_MAX;
2652 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2656 a->i_mb_type8x16 = B_L0_L0
2657 + (a->i_mb_partition8x16[0]>>2) * 3
2658 + (a->i_mb_partition8x16[1]>>2);
2659 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2662 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2664 int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
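/* Partitions whose SATD-based cost exceeds the best SATD by more than 25%
 * (50% for the 16x16 check below) are not considered worth full RD scoring;
 * with early termination disabled every candidate is re-scored. */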
2666 h->mb.i_type = P_L0;
2667 if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
2669 h->mb.i_partition = D_16x16;
2670 x264_analyse_update_cache( h, a );
2671 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2674 if( a->l0.i_cost16x8 < thresh )
2676 h->mb.i_partition = D_16x8;
2677 x264_analyse_update_cache( h, a );
2678 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2681 a->l0.i_cost16x8 = COST_MAX;
2683 if( a->l0.i_cost8x16 < thresh )
2685 h->mb.i_partition = D_8x16;
2686 x264_analyse_update_cache( h, a );
2687 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2690 a->l0.i_cost8x16 = COST_MAX;
2692 if( a->l0.i_cost8x8 < thresh )
2694 h->mb.i_type = P_8x8;
2695 h->mb.i_partition = D_8x8;
2696 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2698 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2699 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2700 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2701 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2702 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2703 * for future blocks are those left over from previous RDO calls. */
2704 for( int i = 0; i < 4; i++ )
2706 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2707 int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
2708 int subtype, btype = D_L0_8x8;
2709 uint64_t bcost = COST_MAX64;
2710 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2713 if( costs[subtype] > sub8x8_thresh )
2715 h->mb.i_sub_partition[i] = subtype;
2716 x264_mb_cache_mv_p8x8( h, a, i );
2717 if( subtype == btype )
2719 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2720 COPY2_IF_LT( bcost, cost, btype, subtype );
2722 if( h->mb.i_sub_partition[i] != btype )
2724 h->mb.i_sub_partition[i] = btype;
2725 x264_mb_cache_mv_p8x8( h, a, i );
2730 x264_analyse_update_cache( h, a );
2731 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2734 a->l0.i_cost8x8 = COST_MAX;
2737 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2739 int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
2741 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2743 h->mb.i_type = B_DIRECT;
2744 /* Assumes direct/skip MC is still in fdec */
2745 /* Requires b-rdo to be done before intra analysis */
2746 h->mb.b_skip_mc = 1;
2747 x264_analyse_update_cache( h, a );
2748 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2749 h->mb.b_skip_mc = 0;
2752 //FIXME not all the update_cache calls are needed
2753 h->mb.i_partition = D_16x16;
2755 if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
2757 h->mb.i_type = B_L0_L0;
2758 x264_analyse_update_cache( h, a );
2759 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2763 if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
2765 h->mb.i_type = B_L1_L1;
2766 x264_analyse_update_cache( h, a );
2767 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2771 if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
2773 h->mb.i_type = B_BI_BI;
2774 x264_analyse_update_cache( h, a );
2775 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2779 if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
2781 h->mb.i_type = B_8x8;
2782 h->mb.i_partition = D_8x8;
2783 x264_analyse_update_cache( h, a );
2784 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2785 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2789 if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
2791 h->mb.i_type = a->i_mb_type16x8;
2792 h->mb.i_partition = D_16x8;
2793 x264_analyse_update_cache( h, a );
2794 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2798 if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
2800 h->mb.i_type = a->i_mb_type8x16;
2801 h->mb.i_partition = D_8x16;
2802 x264_analyse_update_cache( h, a );
2803 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2807 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2811 if( IS_INTRA(h->mb.i_type) )
2814 switch( h->mb.i_partition )
2817 if( h->mb.i_type == B_BI_BI )
2819 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2820 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2824 for( int i = 0; i < 2; i++ )
2825 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2827 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2828 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2832 for( int i = 0; i < 2; i++ )
2833 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2835 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2836 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2840 for( int i = 0; i < 4; i++ )
2841 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2843 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2844 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2850 static inline void x264_mb_analyse_transform( x264_t *h )
2852 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2854 /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
2857 int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
2858 int i_cost8 = 0, i_cost4 = 0;
2859 /* Not all platforms have a merged SATD function */
2860 if( h->pixf.sa8d_satd[PIXEL_16x16] )
2863 for( int p = 0; p < plane_count; p++ )
2865 cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2866 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2869 i_cost8 = (uint32_t)cost;
2870 i_cost4 = (uint32_t)(cost >> 32);
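/* The merged primitive packs both scores into one 64-bit value: SA8D (used
 * for the 8x8 transform decision) in the low 32 bits and SATD (4x4) in the
 * high 32 bits. */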
2874 for( int p = 0; p < plane_count; p++ )
2876 i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2877 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2878 i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2879 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2883 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2884 h->mb.b_skip_mc = 1;
2888 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2890 if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
2892 uint32_t subpart_bak = M32( h->mb.i_sub_partition );
2893 /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
2894 if( h->mb.i_type == P_8x8 )
2895 M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
2896 else if( !x264_transform_allowed[h->mb.i_type] )
2899 x264_analyse_update_cache( h, a );
2900 h->mb.b_transform_8x8 ^= 1;
2901 /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
2902 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
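/* If the toggled transform size is no worse in RD it is kept, and the stored
 * SATD score is rescaled by the same ratio the RD score improved so that
 * later SATD-based thresholds remain comparable. */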
2904 if( *i_rd >= i_rd8 )
2907 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2912 h->mb.b_transform_8x8 ^= 1;
2913 M32( h->mb.i_sub_partition ) = subpart_bak;
2918 /* Rate-distortion optimal QP selection.
2919 * FIXME: More than half of the benefit of this function seems to be
2920 * in the way it improves the coding of chroma DC (by decimating or
2921 * finding a better way to code a single DC coefficient.)
2922 * There must be a more efficient way to get that portion of the benefit
2923 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2924 * trick. */
2925 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2927 int bcost, cost, failures, prevcost, origcost;
2928 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2929 int last_qp_tried = 0;
2930 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2931 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2933 /* If CBP is already zero, don't raise the quantizer any higher. */
2934 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2936 /* Without psy-RD, require monotonicity when moving quant away from previous
2937 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2938 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2939 * allow 2 failures when moving quant towards previous quant.
2940 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
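/* e.g. with psy-RD off and i_last_qp below orig_qp: stepping the QP upwards
 * (away from the last QP) stops at the first non-improving step, while
 * stepping downwards (towards it) tolerates one non-improving step. */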
2941 int threshold = (!!h->mb.i_psy_rd);
2942 /* Raise the threshold for failures if we're moving towards the last QP. */
2943 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2944 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2946 h->mb.i_qp = orig_qp;
2948 prevcost = origcost;
2950 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2951 * (up to a point) will too. So, jump down to where the threshold will kick in
2952 * and check the QP there. If the CBP is still empty, skip the main loop.
2953 * If it isn't empty, we would have ended up having to check this QP anyways,
2954 * so as long as we store it for later lookup, we lose nothing. */
2955 int already_checked_qp = -1;
2956 int already_checked_cost = COST_MAX;
2957 if( direction == -1 )
2961 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
2962 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2963 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2964 if( !h->mb.cbp[h->mb.i_mb_xy] )
2966 /* If our empty-CBP block has a lower QP than the last QP,
2967 * the last QP would almost surely give an empty CBP as well. */
2968 if( h->mb.i_last_qp > h->mb.i_qp )
2972 already_checked_qp = h->mb.i_qp;
2973 h->mb.i_qp = orig_qp;
2977 h->mb.i_qp += direction;
2978 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
2980 if( h->mb.i_last_qp == h->mb.i_qp )
2982 if( h->mb.i_qp == already_checked_qp )
2983 cost = already_checked_cost;
2986 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2987 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2988 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2991 /* We can't assume that the costs are monotonic over QPs.
2992 * Tie case-as-failure seems to give better results. */
2993 if( cost < prevcost )
2999 if( failures > threshold )
3001 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
3003 h->mb.i_qp += direction;
3007 /* Always try the last block's QP. */
3008 if( !last_qp_tried )
3010 h->mb.i_qp = h->mb.i_last_qp;
3011 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3012 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3013 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3017 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3019 /* Check transform again; decision from before may no longer be optimal. */
3020 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
3021 x264_mb_transform_8x8_allowed( h ) )
3023 h->mb.b_transform_8x8 ^= 1;
3024 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3026 h->mb.b_transform_8x8 ^= 1;
3030 /*****************************************************************************
3031 * x264_macroblock_analyse:
3032 *****************************************************************************/
3033 void x264_macroblock_analyse( x264_t *h )
3035 x264_mb_analysis_t analysis;
3036 int i_cost = COST_MAX;
3038 h->mb.i_qp = x264_ratecontrol_mb_qp( h );
3039 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
3040 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
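/* e.g. if the previous MB used QP 30 and this one would get 29 or 31, reuse
 * QP 30 and save the delta bits. */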
3041 if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
3042 h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
3044 if( h->param.analyse.b_mb_info )
3045 h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
3046 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
3048 /*--------------------------- Do the analysis ---------------------------*/
3049 if( h->sh.i_type == SLICE_TYPE_I )
3052 if( analysis.i_mbrd )
3053 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3054 x264_mb_analyse_intra( h, &analysis, COST_MAX );
3055 if( analysis.i_mbrd )
3056 x264_intra_rd( h, &analysis, COST_MAX );
3058 i_cost = analysis.i_satd_i16x16;
3059 h->mb.i_type = I_16x16;
3060 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
3061 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
3062 if( analysis.i_satd_pcm < i_cost )
3063 h->mb.i_type = I_PCM;
3065 else if( analysis.i_mbrd >= 2 )
3066 x264_intra_rd_refine( h, &analysis );
3068 else if( h->sh.i_type == SLICE_TYPE_P )
3072 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
3074 analysis.b_try_skip = 0;
3075 if( analysis.b_force_intra )
3077 if( !h->param.analyse.b_psy )
3079 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3080 goto intra_analysis;
3085 /* Special fast-skip logic using information from mb_info. */
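/* A macroblock flagged constant by the caller can be coded as a skip without
 * analysis, provided there is no MBAFF, ref0 is the immediately preceding
 * frame, weighted prediction is off, and the co-located block's effective QP
 * was no higher than this one's (so its reconstruction should be at least as
 * accurate). */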
3086 if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
3088 if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
3089 h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
3091 h->mb.i_partition = D_16x16;
3092 /* Use the P-SKIP MV if we can... */
3093 if( !M32(h->mb.cache.pskip_mv) )
3096 h->mb.i_type = P_SKIP;
3098 /* Otherwise, just force a 16x16 block. */
3101 h->mb.i_type = P_L0;
3102 analysis.l0.me16x16.i_ref = 0;
3103 M32( analysis.l0.me16x16.mv ) = 0;
3107 /* Reset the information accordingly */
3108 else if( h->param.analyse.b_mb_info_update )
3109 h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
3112 int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
3113 /* If the current macroblock is off the frame, just skip it. */
3114 if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
3116 /* Fast P_SKIP detection */
3117 else if( h->param.analyse.b_fast_pskip )
3120 // FIXME don't need to check this if the reference frame is done
3122 else if( h->param.analyse.i_subpel_refine >= 3 )
3123 analysis.b_try_skip = 1;
3124 else if( h->mb.i_mb_type_left[0] == P_SKIP ||
3125 h->mb.i_mb_type_top == P_SKIP ||
3126 h->mb.i_mb_type_topleft == P_SKIP ||
3127 h->mb.i_mb_type_topright == P_SKIP )
3128 b_skip = x264_macroblock_probe_pskip( h );
3132 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
3136 h->mb.i_type = P_SKIP;
3137 h->mb.i_partition = D_16x16;
3138 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
3140 /* Set up MVs for future predictors */
3141 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3142 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3146 const unsigned int flags = h->param.analyse.inter;
3149 int i_satd_inter, i_satd_intra;
3151 x264_mb_analyse_load_costs( h, &analysis );
3153 x264_mb_analyse_inter_p16x16( h, &analysis );
3155 if( h->mb.i_type == P_SKIP )
3157 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3158 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3162 if( flags & X264_ANALYSE_PSUB16x16 )
3164 if( h->param.analyse.b_mixed_references )
3165 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
3167 x264_mb_analyse_inter_p8x8( h, &analysis );
3170 /* Select best inter mode */
3172 i_partition = D_16x16;
3173 i_cost = analysis.l0.me16x16.cost;
3175 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3176 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
3179 i_partition = D_8x8;
3180 i_cost = analysis.l0.i_cost8x8;
3183 if( flags & X264_ANALYSE_PSUB8x8 )
3185 for( int i = 0; i < 4; i++ )
3187 x264_mb_analyse_inter_p4x4( h, &analysis, i );
3188 int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
3189 if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
3191 int i_cost8x8 = analysis.l0.i_cost4x4[i];
3192 h->mb.i_sub_partition[i] = D_L0_4x4;
3194 x264_mb_analyse_inter_p8x4( h, &analysis, i );
3195 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
3196 h->mb.i_sub_partition[i], D_L0_8x4 );
3198 x264_mb_analyse_inter_p4x8( h, &analysis, i );
3199 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
3200 h->mb.i_sub_partition[i], D_L0_4x8 );
3202 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
3204 x264_mb_cache_mv_p8x8( h, &analysis, i );
3206 analysis.l0.i_cost8x8 = i_cost;
3210 /* Now do 16x8/8x16 */
3211 int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
3212 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3213 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
3215 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
3216 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3217 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3219 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
3220 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
3222 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
3223 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3224 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3226 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
3227 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
3230 h->mb.i_partition = i_partition;
3233 //FIXME mb_type costs?
3234 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3238 else if( i_partition == D_16x16 )
3240 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3241 i_cost = analysis.l0.me16x16.cost;
3243 else if( i_partition == D_16x8 )
3245 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
3246 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
3247 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
3249 else if( i_partition == D_8x16 )
3251 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
3252 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
3253 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
3255 else if( i_partition == D_8x8 )
3258 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3260 switch( h->mb.i_sub_partition[i8x8] )
3263 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
3264 i_cost += analysis.l0.me8x8[i8x8].cost;
3267 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
3268 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
3269 i_cost += analysis.l0.me8x4[i8x8][0].cost +
3270 analysis.l0.me8x4[i8x8][1].cost;
3273 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
3274 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
3275 i_cost += analysis.l0.me4x8[i8x8][0].cost +
3276 analysis.l0.me4x8[i8x8][1].cost;
3280 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
3281 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
3282 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
3283 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
3284 i_cost += analysis.l0.me4x4[i8x8][0].cost +
3285 analysis.l0.me4x4[i8x8][1].cost +
3286 analysis.l0.me4x4[i8x8][2].cost +
3287 analysis.l0.me4x4[i8x8][3].cost;
3290 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
3296 if( h->mb.b_chroma_me )
3300 x264_mb_analyse_intra( h, &analysis, i_cost );
3301 x264_mb_analyse_intra_chroma( h, &analysis );
3305 x264_mb_analyse_intra_chroma( h, &analysis );
3306 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
3308 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3309 analysis.i_satd_i8x8 += analysis.i_satd_chroma;
3310 analysis.i_satd_i4x4 += analysis.i_satd_chroma;
3313 x264_mb_analyse_intra( h, &analysis, i_cost );
3315 i_satd_inter = i_cost;
3316 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
3317 analysis.i_satd_i8x8,
3318 analysis.i_satd_i4x4 );
3320 if( analysis.i_mbrd )
3322 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
3324 i_partition = D_16x16;
3325 i_cost = analysis.l0.i_rd16x16;
3326 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
3327 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
3328 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
3329 h->mb.i_type = i_type;
3330 h->mb.i_partition = i_partition;
3331 if( i_cost < COST_MAX )
3332 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3333 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
3336 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3337 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3338 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3339 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3341 h->mb.i_type = i_type;
3343 if( analysis.b_force_intra && !IS_INTRA(i_type) )
3345 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
3346 * it was an inter block. */
3347 x264_analyse_update_cache( h, &analysis );
3348 x264_macroblock_encode( h );
3349 for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
3350 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
3353 int height = 16 >> CHROMA_V_SHIFT;
3354 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
3355 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
3357 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3358 goto intra_analysis;
            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    x264_analyse_update_cache( h, &analysis );
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }

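        /* Full B-frame analysis below is only reached when the skip probe above failed. */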
        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

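            /* Early termination: if direct is nearly as cheap as the best 16x16 mode,
             * check whether an RD-scored skip beats every 16x16 candidate and bail out early. */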
            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

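            /* Sub-16x16 B partitions (8x8, and estimated 16x8/8x16) are only searched when
             * enabled by the partition flags in h->param.analyse.inter. */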
            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;

                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;

                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];

                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }

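            /* Quarter-pel refinement of the winning partition; when RD mode decision is
             * active this step is deferred to the RD refinement pass further below. */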
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }

            i_satd_inter = i_cost;

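            /* With RD mode decision, re-score every candidate B type (including the skip
             * cost computed earlier) in the RD domain before choosing the final type. */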
            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

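            /* As in the P-frame path: when chroma ME is enabled, chroma SATD is added to the
             * intra costs so they stay comparable with the chroma-inclusive inter costs. */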
            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8 += analysis.i_satd_chroma;
                analysis.i_satd_i4x4 += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

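                /* RD-based refinement of the final motion vectors, specialized per partition
                 * shape; bidirectional partitions go through the weighted bi-pred refinement. */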
                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }

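    /* Common tail for all slice types: commit the decision to the MB cache, then pick the
     * transform size, per-MB QP (at RD level 3), and trellis/noise-reduction settings. */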
    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_PCM:
            break;
        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;
                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;
                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;
        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;
        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }
        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;
        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;
        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

#ifndef NDEBUG
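    /* Sanity check for frame-based threading: make sure no selected MV points below the
     * rows of the reference frame that other threads have already finished. If one does,
     * log the details and fall back to a plain intra 16x16 re-analysis. */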
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"