/*****************************************************************************
 * analyse.c: macroblock analysis
 *****************************************************************************
 * Copyright (C) 2003-2015 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#define _ISOC99_SOURCE

#include "common/common.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
#include "analyse.h"
#include "rdo.c"

typedef struct
{
    /* 16x16 */
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;

typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;
    uint16_t *p_cost_ref[2];
    int i_mbrd;

    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh.  Only supported in P-frames. */
    int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_chroma;
    int i_satd_chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2]; /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
    int b_early_terminate;

} x264_mb_analysis_t;

/* lambda = pow(2,qp/6-2) */
const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
{
   1,   1,   1,   1,   1,   1,   1,   1, /*  0- 7 */
   1,   1,   1,   1,   1,   1,   1,   1, /*  8-15 */
   2,   2,   2,   2,   3,   3,   3,   4, /* 16-23 */
   4,   4,   5,   6,   6,   7,   8,   9, /* 24-31 */
  10,  11,  13,  14,  16,  18,  20,  23, /* 32-39 */
  25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
  64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
2580,2896, /* 80-81 */
};
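
/* A quick sanity check on the table above (illustrative only, not used by
 * the code): qp=30 gives lambda = 2^(30/6-2) = 2^3 = 8, matching entry 30,
 * and qp=51 (QP_MAX_SPEC) gives 2^6.5 ~= 90.5, stored rounded as 91.
 * Entries past 51 cover the out-of-spec QP range that is only ever used for
 * lambda lookups, never actually coded. */
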
/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Capped to avoid overflow */
const int x264_lambda2_tab[QP_MAX_MAX+1] =
{
       14,       18,       22,       28,       36,       45,       57,       72, /*  0- 7 */
       91,      115,      145,      182,      230,      290,      365,      460, /*  8-15 */
      580,      731,      921,     1161,     1462,     1843,     2322,     2925, /* 16-23 */
     3686,     4644,     5851,     7372,     9289,    11703,    14745,    18578, /* 24-31 */
    23407,    29491,    37156,    46814,    58982,    74313,    93628,   117964, /* 32-39 */
   148626,   187257,   235929,   297252,   374514,   471859,   594505,   749029, /* 40-47 */
   943718,  1189010,  1498059,  1887436,  2378021,  2996119,  3774873,  4756042, /* 48-55 */
  5992238,  7549747,  9512085, 11984476, 15099494, 19024170, 23968953, 30198988, /* 56-63 */
 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */
134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
};
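
/* Derivation note for the table above: with lambda = 2^(qp/6-2), lambda2 is
 * lambda^2 * 0.9 * 256 = 2^(qp/3-4) * 230.4; e.g. qp=30 -> 2^6 * 230.4 =
 * 14745.6, stored as 14745. The cap of 134217727 (2^27-1) keeps later sums
 * of lambda2-scaled costs safely inside a 32-bit int. */
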
const uint8_t x264_exp2_lut[64] =
{
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
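
/* The entries above are, to rounding, (2^(i/64) - 1) * 256, i.e. the
 * fractional part of a base-2 exponential in Q8 fixed point; e.g. i=32 ->
 * (sqrt(2)-1)*256 ~= 106. Presumably consumed by the fixed-point exp2
 * helper in common.h, which supplies the integer part via a shift. */
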
const float x264_log2_lut[128] =
{
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
/* Avoid an int/float conversion. */
const float x264_log2_lz_lut[32] =
{
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
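
/* How the two tables cooperate (a sketch; the actual helper lives in
 * common.h): x264_log2_lz_lut maps a count-leading-zeros result to the
 * integer part of log2 of a 32-bit value, while x264_log2_lut above supplies
 * the fractional part from the top mantissa bits, giving a cheap piecewise
 * log2 approximation with no int/float conversion in the hot path. */
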
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
{
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
              46,       58,       73,       92,      117,      147,
             185,      233,      294,      370,      466,      587,
             740,      932,     1174,     1480,     1864,     2349,
            2959,     3728,     4697,     5918,     7457,     9395,
           11837,    14914,    18790,    23674,    29828,    37581,
           47349,    59656,    75163,    94699,   119313,   150326,
          189399,   238627,   300652,   378798,   477255,   601304,
          757596,   954511,  1202608,  1515192,  1909022,  2405217,
         3030384,  3818045,  4810435,  6060769,  7636091,  9620872,
        12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
        48486154, 61088726, 76966972, 96972308,
       122177453,134217727,134217727,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
    },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {
              27,       34,       43,       54,       68,       86,
             108,      136,      172,      216,      273,      343,
             433,      545,      687,      865,     1090,     1374,
            1731,     2180,     2747,     3461,     4361,     5494,
            6922,     8721,    10988,    13844,    17442,    21976,
           27688,    34885,    43953,    55377,    69771,    87906,
          110755,   139543,   175813,   221511,   279087,   351627,
          443023,   558174,   703255,   886046,  1116348,  1406511,
         1772093,  2232697,  2813022,  3544186,  4465396,  5626046,
         7088374,  8930791, 11252092, 14176748, 17861583, 22504184,
        28353495, 35723165, 45008368, 56706990,
        71446330, 90016736,113413980,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
       134217727,134217727,134217727,134217727,134217727,134217727,
    }
};
#define MAX_CHROMA_LAMBDA_OFFSET 36
static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
{
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
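
/* Each entry above is roughly 256 * 2^((i-12)/3): a Q8 multiplier on the
 * chroma lambda2 that doubles for every 3 QP of luma/chroma QP difference.
 * Index 12 (a QP offset of zero, after the +12 bias applied in
 * x264_mb_analyse_init_qp below) is exactly 256, i.e. a 1.0x multiplier. */
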
/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
{
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] =
{
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] =
{
    5, 3, 3, 1
};

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static uint16_t x264_cost_ref[QP_MAX+1][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];

static int init_costs( x264_t *h, float *logs, int qp )
{
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[qp] )
        return 0;
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv[qp] += 2*4*2048;
    for( int i = 0; i <= 2*4*2048; i++ )
    {
        h->cost_mv[qp][-i] =
        h->cost_mv[qp][i]  = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
    }
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( int i = 0; i < 3; i++ )
        for( int j = 0; j < 33; j++ )
            x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
    x264_pthread_mutex_unlock( &cost_ref_mutex );
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
    {
        for( int j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[qp][j] += 2*2048;
            for( int i = -2*2048; i < 2*2048; i++ )
                h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
        }
    }
    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
    for( int i = 0; i < 17; i++ )
        cost_i4x4_mode[i] = 3*lambda*(i!=8);
    return 0;
fail:
    return -1;
}

int x264_analyse_init_costs( x264_t *h )
{
    float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
    if( !logs )
        return -1;

    logs[0] = 0.718f;
    for( int i = 1; i <= 2*4*2048; i++ )
        logs[i] = log2f( i+1 ) * 2.0f + 1.718f;

    for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
        if( init_costs( h, logs, qp ) )
            goto fail;

    if( init_costs( h, logs, X264_LOOKAHEAD_QP ) )
        goto fail;

    x264_free( logs );
    return 0;
fail:
    x264_free( logs );
    return -1;
}
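
/* Layout note on the tables built above (a sketch): h->cost_mv[qp] points to
 * the middle of its allocation, so it is indexed directly with signed
 * quarter-pel mv deltas in [-16384,16384]; each entry approximates
 * lambda * (2*log2(|delta|+1) + 1.718), a fractional-bit estimate of the
 * Exp-Golomb mvd cost. cost_mv_fpel then splits the same data into four
 * arrays, one per quarter-pel phase, for fullpel (ESA/TESA) searches. */
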
void x264_analyse_free_costs( x264_t *h )
{
    for( int i = 0; i < QP_MAX+1; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        if( h->cost_mv_fpel[i][0] )
            for( int j = 0; j < 4; j++ )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}

void x264_analyse_weight_frame( x264_t *h, int end )
{
    for( int j = 0; j < h->i_ref[0]; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref[0][j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << PARAM_INTERLACED;
            int offset, height;
            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
            height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;
            if( height )
                for( int k = j; k < h->i_ref[0]; k++ )
                    if( h->sh.weight[k][0].weightfn )
                    {
                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                 src + offset, frame->i_stride[0],
                                                 width, height, &h->sh.weight[k][0] );
                    }
            break;
        }
    }
}

/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    a->p_cost_mv = h->cost_mv[a->i_qp];
    a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}

static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
    a->i_lambda = x264_lambda_tab[qp];
    a->i_lambda2 = x264_lambda2_tab[qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;

    if( qp > QP_MAX_SPEC )
    {
        h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
        h->nr_residual_sum = h->nr_residual_sum_buf[1];
        h->nr_count = h->nr_count_buf[1];
        h->mb.b_noise_reduction = 1;
        qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
    }
    else
    {
        h->nr_offset = h->nr_offset_denoise;
        h->nr_residual_sum = h->nr_residual_sum_buf[0];
        h->nr_count = h->nr_count_buf[0];
        h->mb.b_noise_reduction = 0;
    }

    a->i_qp = h->mb.i_qp = qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[qp];
}
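
/* Worked example for the function above (8-bit depth, no chroma QP offset):
 * at qp=51, effective_chroma_qp is chroma_qp_table[51] = 39; at an
 * out-of-spec qp=60 it becomes 39 + (60-51) = 48, so the trellis and chroma
 * lambdas keep scaling up even though the QP actually coded is clamped to
 * QP_MAX_SPEC at the end of the function. */
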
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;

    x264_mb_analyse_init_qp( h, a, qp );

    h->mb.b_transform_8x8 = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
     * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
    uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
    a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;

    a->b_fast_intra = 0;
    a->b_avoid_topright = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
        {
            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    for( int j = 0; j < h->i_ref[i]; j++ )
                    {
                        x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
                    }

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( PARAM_INTERLACED )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            if( PARAM_INTERLACED )
            {
                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
                for( int i = 0; i < 3; i++ )
                {
                    int j = i == 2;
                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
                    h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
                    h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
                    h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
                }
            }
            else
            {
                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
                h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
                h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
                h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
                h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
                h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
            }
        }
        if( PARAM_INTERLACED )
        {
            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
            h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
            h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
        }
#undef CLIP_FMV

        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Always run in fast-intra mode for subme < 3 */
            if( h->mb.i_subpel_refine > 2 &&
              ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
        }
        else
            a->b_force_intra = 0;
    }
}
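
/* The MV range arithmetic above, spelled out: mv units are quarter-pels, so
 * mv_min[0] = 4*(-16*mb_x - 24) lets a predictor reach 24 pixels into the
 * padded border beyond the left frame edge; e.g. mb_x=10 gives 4*(-160-24) =
 * -736 quarter-pels, i.e. 184 full pels of leftward reach. The +-24 matches
 * the horizontal padding, and i_fpel_border then shaves off the distance a
 * UMH-style search can overshoot its nominal range per iteration. */
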
/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i8x8_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    }
};

static const int8_t i4x4_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
    }
};

static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i16x16_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return chroma_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && (i&1);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i8x8_mode_available[avoid_topright][idx];
}

static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
{
    int avoid_topright = force_intra && ((i&5) == 5);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i4x4_mode_available[avoid_topright][idx];
}
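
/* Example of the neighbour->row mapping used by the four helpers above,
 * assuming the usual one-hot MB_LEFT/MB_TOP flag values: LEFT only selects
 * row 1, TOP only row 2, LEFT+TOP (without TOPLEFT) row 3, and all three
 * row 4, the only row allowed to use plane/diagonal modes that need the
 * top-left sample. The [avoid_topright] index masks out modes that would
 * read top-right pixels during Periodic Intra Refresh. */
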
/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
    ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};

    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}

/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;

    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}

static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    if( a->i_satd_chroma < COST_MAX )
        return;

    if( CHROMA444 )
    {
        if( !h->mb.b_chroma_me )
        {
            a->i_satd_chroma = 0;
            return;
        }

        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
        if( h->mb.b_lossless )
        {
            x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
            x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
        }
        else
        {
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
        }
        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
        return;
    }

    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];

    /* Prediction selection for chroma */
    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}

/* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    pixel *p_src = h->mb.pic.p_fenc[0];
    pixel *p_dst = h->mb.pic.p_fdec[0];
    static const int8_t intra_analysis_shortcut[2][2][2][5] =
    {
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {-1, -1, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
    };

    int idx;
    int lambda = a->i_lambda;

    /*---------------- Try all modes and calculate their scores ---------------*/
    /* Disabled i16x16 for AVC-Intra compat */
    if( !h->param.i_avcintra_class )
    {
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

        /* Not heavily tuned */
        static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
        int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;

        if( !h->mb.b_lossless && predict_mode[3] >= 0 )
        {
            h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
            a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
            a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
            a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );

            /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
            if( a->i_satd_i16x16 <= i16x16_thresh )
            {
                h->predict_16x16[I_PRED_16x16_P]( p_dst );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
                COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
            }
        }
        else
        {
            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_satd;
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_16x16( h, 0, i_mode );
                else
                    h->predict_16x16[i_mode]( p_dst );

                i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                         lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
                COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
                a->i_satd_i16x16_dir[i_mode] = i_satd;
            }
        }

        if( h->sh.i_type == SLICE_TYPE_B )
            /* cavlc mb type prefix */
            a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];

        if( a->i_satd_i16x16 > i16x16_thresh )
            return;
    }

    uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[36] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        // FIXME some bias like in i4x4?
        int i_cost = lambda * 4; /* base predmode costs */
        h->mb.i_cbp_luma = 0;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict8x8[idx] = i_best;
                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                    h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    for( int i = 2; i >= 0; i-- )
                    {
                        int cost = satd[i];
                        a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
                        COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                    }

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
                {
                    int i_satd;
                    int i_mode = *predict_mode;

                    if( h->mb.b_lossless )
                        x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
                    else
                        h->predict_8x8[i_mode]( p_dst_by, edge );

                    i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        i_satd -= 3 * lambda;

                    COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                    a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
                }
                i_cost += i_best + 3*lambda;

                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
                else
                    h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
            }
            /* we need to encode this block now (for next ones) */
            x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
        }
        /* Not heavily tuned */
        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
        if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
        int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
        h->mb.i_cbp_luma = 0;

        if( a->b_early_terminate && a->i_mbrd )
            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
        {
            pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
            pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );

            if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
                i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict4x4[idx] = i_best;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                    h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
                    COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                if( i_best > 0 )
                {
                    for( ; *predict_mode >= 0; predict_mode++ )
                    {
                        int i_satd;
                        int i_mode = *predict_mode;

                        if( h->mb.b_lossless )
                            x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
                        else
                            h->predict_4x4[i_mode]( p_dst_by );

                        i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                        if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        {
                            i_satd -= lambda * 3;
                            if( i_satd <= 0 )
                            {
                                i_best = i_satd;
                                a->i_predict4x4[idx] = i_mode;
                                break;
                            }
                        }

                        COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
                    }
                }

                i_cost += i_best + 3 * lambda;
                if( i_cost > i_satd_thresh || idx == 15 )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
                else
                    h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
                h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
            }
            /* we need to encode this block now (for next ones) */
            x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
        }
        if( idx == 15 )
        {
            a->i_satd_i4x4 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
            }
        }
        else
            a->i_satd_i4x4 = COST_MAX;
    }
}
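
/* Note on the cost_div_fix8 scaling in the 8x8 path above: if that loop broke
 * after analysing only idx+1 of the 4 blocks, the accumulated cost is
 * extrapolated to a whole macroblock by multiplying by 4/(idx+1) in Q8 fixed
 * point: 1024/256 = 4, 512/256 = 2, 341/256 ~= 4/3. Likewise, the
 * intra_mbcmp_x9 helpers pack their result as (best_mode << 16) | cost, which
 * is why both callers mask with 0xffff and then shift right by 16. */
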
static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( !a->b_early_terminate )
        i_satd_thresh = COST_MAX;

    if( a->i_satd_i16x16 < i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 < i_satd_thresh )
    {
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 < i_satd_thresh )
    {
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint64_t i_satd, i_best;
    int plane_count = CHROMA444 ? 3 : 1;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    if( !CHROMA444 )
    {
        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
        if( predict_mode[1] >= 0 )
        {
            int8_t predict_mode_sorted[4];
            int i_max;
            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;

            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                    predict_mode_sorted[i_max++] = i_mode;
            }

            if( i_max > 0 )
            {
                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
                /* the previous thing encoded was x264_intra_rd(), so the pixels and
                 * coefs for the current chroma mode are still around, so we only
                 * have to recount the bits. */
                i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
                for( int i = 0; i < i_max; i++ )
                {
                    int i_mode = predict_mode_sorted[i];
                    if( h->mb.b_lossless )
                        x264_predict_lossless_chroma( h, i_mode );
                    else
                    {
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
                    }
                    /* if we've already found a mode that needs no residual, then
                     * probably any mode with a residual will be worse.
                     * so avoid dct on the remaining modes to improve speed. */
                    i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                }
                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
                h->mb.i_cbp_chroma = i_cbp_chroma_best;
            }
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
        int nnz[3] = {0};
        for( int idx = 0; idx < 16; idx++ )
        {
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                for( int p = 0; p < plane_count; p++ )
                    /* emulate missing topright samples */
                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
                    }
                }
            }

            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
            }

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
        pixel4 pels_h[3][2] = {{0}};
        pixel pels_v[3][7] = {{0}};
        uint16_t nnz[3][2] = {{0}}; //shut up gcc
        for( int idx = 0; idx < 4; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            int s8 = X264_SCAN8_0 + 2*x + 16*y;
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
            int cbp_luma_new = 0;
            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;

            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            for( int p = 0; p < plane_count; p++ )
                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
                    continue;

                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
                        if( !(idx&1) )
                            for( int j = 0; j < 7; j++ )
                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
                    }
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
                if( !(idx&1) )
                    for( int j = 0; j < 7; j++ )
                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
            }

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}

#define LOAD_FENC(m, src, xoff, yoff) \
{ \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
}

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
{ \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    if( CHROMA444 ) \
    { \
        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
    } \
    else \
        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = x264_weight_none; \
    (m)->i_ref = ref; \
}

#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];

#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])

static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_mvc;
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        m.i_ref_cost = REF_COST( 0, i_ref );
        i_halfpel_thresh -= m.i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );

        if( h->mb.ref_blind_dupe == i_ref )
        {
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
        }
        else
        {
            x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        }

        /* save mv for predicting neighbors */
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
        CP32( a->l0.mvc[i_ref][0], m.mv );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0
            && a->b_try_skip
            && m.cost-m.cost_mv < 300*a->i_lambda
            &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
              + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
        {
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            return;
        }

        m.cost += m.i_ref_cost;
        i_halfpel_thresh += m.i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
    if( a->i_mbrd )
    {
        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
                h->mb.i_type = P_SKIP;
        }
    }
}

static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
}

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
    {
        i_maxref = 0;
        CHECK_NEIGHBOUR(  -8 - 1 );
        CHECK_NEIGHBOUR(  -8 + 0 );
        CHECK_NEIGHBOUR(  -8 + 2 );
        CHECK_NEIGHBOUR(  -8 + 4 );
        CHECK_NEIGHBOUR(   0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
#undef CHECK_NEIGHBOUR

    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        a->i_satd8x8[0][i] = m->cost - m->cost_mv;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost16x8 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

1674 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1677 pixel **p_fenc = h->mb.pic.p_fenc;
1678 ALIGNED_4( int16_t mvc[3][2] );
1680 /* XXX Needed for x264_mb_predict_mv */
1681 h->mb.i_partition = D_8x16;
1683 for( int i = 0; i < 2; i++ )
1685 x264_me_t *l0m = &a->l0.me8x16[i];
1686 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1687 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1688 const int ref8[2] = { minref, maxref };
1689 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1691 m.i_pixel = PIXEL_8x16;
1693 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1694 l0m->cost = INT_MAX;
1695 for( int j = 0; j < i_ref8s; j++ )
1697 const int i_ref = ref8[j];
1698 m.i_ref_cost = REF_COST( 0, i_ref );
1700 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1701 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1702 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1704 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1705 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1707 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1708 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1709 /* We can only take this shortcut if the first search was performed on ref0. */
1710 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1712 /* We can just leave the MV from the previous ref search. */
1713 x264_me_refine_qpel_refdupe( h, &m, NULL );
1716 x264_me_search( h, &m, mvc, 3 );
1718 m.cost += m.i_ref_cost;
1720 if( m.cost < l0m->cost )
1721 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1724 /* Early termination based on the current SATD score of partition[0]
1725 plus the estimated SATD score of partition[1] */
1726 if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
1728 a->l0.i_cost8x16 = COST_MAX;
1732 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1733 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1736 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
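/* Chroma cost of a sub-8x8 L0 partition. chroma_h_shift/chroma_v_shift encode
 * the subsampling (4:2:0 -> 1/1, 4:2:2 -> 1/0, 4:4:4 -> 0/0); 'or' is the
 * offset of this 8x8 block in the interleaved chroma reference plane. For
 * 4:4:4 the U and V planes are simply motion-compensated as luma. */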
1739 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
1740 pixel **p_fref, int i8x8, int size, int chroma )
1742 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
1743 pixel *pix2 = pix1+8;
1744 int i_stride = h->mb.pic.i_stride[1];
1745 int chroma_h_shift = chroma <= CHROMA_422;
1746 int chroma_v_shift = chroma == CHROMA_420;
1747 int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
1748 int i_ref = a->l0.me8x8[i8x8].i_ref;
1749 int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1750 x264_weight_t *weight = h->sh.weight[i_ref];
1752 // FIXME weight can be done on 4x4 blocks even if mc is smaller
1753 #define CHROMA4x4MC( width, height, me, x, y ) \
1754 if( chroma == CHROMA_444 ) \
1756 int mvx = (me).mv[0] + 4*2*x; \
1757 int mvy = (me).mv[1] + 4*2*y; \
1758 h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
1759 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
1760 h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
1761 mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
1765 int offset = x + (2>>chroma_v_shift)*16*y; \
1766 int chroma_height = (2>>chroma_v_shift)*height; \
1767 h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
1768 (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
1769 if( weight[1].weightfn ) \
1770 weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
1771 if( weight[2].weightfn ) \
1772 weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
1775 if( size == PIXEL_4x4 )
1777 x264_me_t *m = a->l0.me4x4[i8x8];
1778 CHROMA4x4MC( 2,2, m[0], 0,0 );
1779 CHROMA4x4MC( 2,2, m[1], 2,0 );
1780 CHROMA4x4MC( 2,2, m[2], 0,2 );
1781 CHROMA4x4MC( 2,2, m[3], 2,2 );
1783 else if( size == PIXEL_8x4 )
1785 x264_me_t *m = a->l0.me8x4[i8x8];
1786 CHROMA4x4MC( 4,2, m[0], 0,0 );
1787 CHROMA4x4MC( 4,2, m[1], 0,2 );
1791 x264_me_t *m = a->l0.me4x8[i8x8];
1792 CHROMA4x4MC( 2,4, m[0], 0,0 );
1793 CHROMA4x4MC( 2,4, m[1], 2,0 );
1797 int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
1798 int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
1799 return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1800 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1803 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1805 if( CHROMA_FORMAT == CHROMA_444 )
1806 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
1807 else if( CHROMA_FORMAT == CHROMA_422 )
1808 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
1810 return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
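/* Sub-8x8 search chain: the 4x4 search runs first, seeded with the 8x8 MV;
 * the 8x4 and 4x8 searches below reuse the first 4x4 MV as their predictor.
 * Each candidate is priced as SATD + MV cost + ref cost + sub-partition type
 * cost, plus chroma SATD when chroma ME is enabled. */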
1813 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1815 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1816 pixel **p_fenc = h->mb.pic.p_fenc;
1817 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1819 /* XXX Needed for x264_mb_predict_mv */
1820 h->mb.i_partition = D_8x8;
1822 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1824 const int idx = 4*i8x8 + i4x4;
1825 const int x4 = block_idx_x[idx];
1826 const int y4 = block_idx_y[idx];
1827 const int i_mvc = (i4x4 == 0);
1829 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1831 m->i_pixel = PIXEL_4x4;
1833 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1834 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1835 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1837 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1838 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1840 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1842 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1843 a->l0.me4x4[i8x8][1].cost +
1844 a->l0.me4x4[i8x8][2].cost +
1845 a->l0.me4x4[i8x8][3].cost +
1846 REF_COST( 0, i_ref ) +
1847 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1848 if( h->mb.b_chroma_me )
1849 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1852 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1854 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1855 pixel **p_fenc = h->mb.pic.p_fenc;
1856 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1858 /* XXX Needed for x264_mb_predict_mv */
1859 h->mb.i_partition = D_8x8;
1861 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1863 const int idx = 4*i8x8 + 2*i8x4;
1864 const int x4 = block_idx_x[idx];
1865 const int y4 = block_idx_y[idx];
1866 const int i_mvc = (i8x4 == 0);
1868 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1870 m->i_pixel = PIXEL_8x4;
1872 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1873 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1874 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1876 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1877 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1879 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1881 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1882 REF_COST( 0, i_ref ) +
1883 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1884 if( h->mb.b_chroma_me )
1885 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1888 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1890 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1891 pixel **p_fenc = h->mb.pic.p_fenc;
1892 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1894 /* XXX Needed for x264_mb_predict_mv */
1895 h->mb.i_partition = D_8x8;
1897 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1899 const int idx = 4*i8x8 + i4x8;
1900 const int x4 = block_idx_x[idx];
1901 const int y4 = block_idx_y[idx];
1902 const int i_mvc = (i4x8 == 0);
1904 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1906 m->i_pixel = PIXEL_4x8;
1908 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1909 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1910 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1912 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1913 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1915 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1917 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1918 REF_COST( 0, i_ref ) +
1919 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1920 if( h->mb.b_chroma_me )
1921 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
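/* Chroma cost of a bidirectional candidate: motion-compensate U/V from both
 * reference lists (full-plane mc_luma for 4:4:4, mc_chroma otherwise), blend
 * with the bipred weights, and SATD the result against the source. */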
1924 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
1926 ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
1927 ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
1928 int i_chroma_cost = 0;
1929 int chromapix = h->luma2chroma_pixel[i_pixel];
1931 #define COST_BI_CHROMA( m0, m1, width, height ) \
1935 h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
1936 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1937 h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
1938 m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
1939 h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
1940 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1941 h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
1942 m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
1946 int v_shift = CHROMA_V_SHIFT; \
1947 int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1948 int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1949 h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
1950 m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1951 h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
1952 m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
1954 h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1955 h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1956 i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
1957 + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
1960 if( i_pixel == PIXEL_16x16 )
1961 COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
1962 else if( i_pixel == PIXEL_16x8 )
1963 COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
1964 else if( i_pixel == PIXEL_8x16 )
1965 COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
1967 COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
1969 return i_chroma_cost;
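/* B_DIRECT cost: fdec already holds the direct-predicted MB (see the comment
 * below), so this is just fenc-vs-fdec SATD. With BSUB16x16 the cost is also
 * accumulated per 8x8 block so that D_DIRECT_8x8 can compete inside B_8x8. */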
1972 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1974 /* Assumes that fdec still contains the results of
1975 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1977 pixel *p_fenc = h->mb.pic.p_fenc[0];
1978 pixel *p_fdec = h->mb.pic.p_fdec[0];
1980 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1981 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1983 int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
1985 for( int i = 0; i < 4; i++ )
1987 const int x = (i&1)*8;
1988 const int y = (i>>1)*8;
1989 a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
1990 &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1991 if( h->mb.b_chroma_me )
1993 int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
1994 int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
1995 a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
1996 &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
1997 + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
1998 &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
2000 a->i_cost16x16direct += a->i_cost8x8direct[i];
2003 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
2008 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
2009 if( h->mb.b_chroma_me )
2011 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2012 a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
2013 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
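/* B 16x16: search list0 and list1 (the loop below is deliberately contorted
 * so the fast-skip test can run as soon as both ref0 searches are done), then
 * price the BI candidate by averaging the two best single-list predictions. */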
2018 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
2020 ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
2021 ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
2023 intptr_t stride0 = 16, stride1 = 16;
2025 ALIGNED_4( int16_t mvc[9][2] );
2026 int try_skip = a->b_try_skip;
2027 int list1_skipped = 0;
2028 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
2029 int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
2030 (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};
2033 m.i_pixel = PIXEL_16x16;
2035 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
2037 /* 16x16 Search on list 0 and list 1 */
2038 a->l0.me16x16.cost = INT_MAX;
2039 a->l1.me16x16.cost = INT_MAX;
2040 for( int l = 1; l >= 0; )
2042 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2044 /* This loop is extremely munged in order to facilitate the following order of operations,
2045 * necessary for an efficient fast skip.
2046 * 1. Search list1 ref0.
2047 * 2. Search list0 ref0.
2048 * 3. Try skip.
2049 * 4. Search the rest of list0.
2050 * 5. Go back and finish list1.
2052 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
2054 if( try_skip && l == 1 && i_ref > 0 )
2060 m.i_ref_cost = REF_COST( l, i_ref );
2062 /* search with ref */
2063 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
2064 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
2065 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
2066 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
2069 m.cost += m.i_ref_cost;
2071 if( m.cost < lX->me16x16.cost )
2072 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
2074 /* save mv for predicting neighbors */
2075 CP32( lX->mvc[i_ref][0], m.mv );
2076 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
2078 /* Fast skip detection. */
2079 if( i_ref == 0 && try_skip )
2081 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
2082 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
2088 /* We already tested skip */
2089 h->mb.i_type = B_SKIP;
2090 x264_analyse_update_cache( h, a );
2095 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
2097 if( list1_skipped && l == 0 )
2103 /* get cost of BI mode */
2104 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
2105 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
2106 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
2107 src0 = h->mc.get_ref( pix0, &stride0,
2108 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
2109 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
2110 src1 = h->mc.get_ref( pix1, &stride1,
2111 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
2112 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );
2114 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2116 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2118 + a->l0.bi16x16.cost_mv
2119 + a->l1.bi16x16.cost_mv;
2121 if( h->mb.b_chroma_me )
2122 a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
2124 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
2125 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
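/* p_cost_mv is indexed by mvd = mv - mvp, so a zero MV costs
 * p_cost_mv[-mvp[0]] + p_cost_mv[-mvp[1]] for each list. */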
2127 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
2128 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
2129 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
2130 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
2131 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2132 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
2133 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2134 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
2135 + ref_costs + l0_mv_cost + l1_mv_cost;
2137 if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
2139 ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
2143 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2144 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
2145 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2146 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
2147 h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2148 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
2149 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2150 cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
2154 ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
2155 int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
2156 int v_shift = CHROMA_V_SHIFT;
2158 if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
2160 int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2161 h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2162 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
2165 h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
2166 h->mb.pic.i_stride[1], 16>>v_shift );
2168 if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
2170 int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
2171 h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2172 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
2175 h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
2176 h->mb.pic.i_stride[1], 16>>v_shift );
2178 h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
2179 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2180 h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
2181 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
2183 cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
2184 + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
2188 if( cost00 < a->i_cost16x16bi )
2190 M32( a->l0.bi16x16.mv ) = 0;
2191 M32( a->l1.bi16x16.mv ) = 0;
2192 a->l0.bi16x16.cost_mv = l0_mv_cost;
2193 a->l1.bi16x16.cost_mv = l1_mv_cost;
2194 a->i_cost16x16bi = cost00;
2199 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
2200 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
2201 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
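/* Write the MVs of the chosen P 8x8 sub-partition layout into the MB cache so
 * that subsequent x264_mb_predict_mv calls see the final motion field. */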
2204 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
2209 switch( h->mb.i_sub_partition[i] )
2212 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
2215 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
2216 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
2219 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
2220 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
2223 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
2224 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
2225 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
2226 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
2229 x264_log( h, X264_LOG_ERROR, "internal error\n" );
2234 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
2238 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
2239 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
2240 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
2241 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
2244 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
2245 if( x264_mb_partition_listX_table[0][part] ) \
2247 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
2248 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
2252 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
2253 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
2255 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
2257 if( x264_mb_partition_listX_table[1][part] ) \
2259 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
2260 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
2264 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
2265 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
2267 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
2270 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2274 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
2276 x264_mb_load_mv_direct8x8( h, i );
2279 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
2280 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
2281 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
2286 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
2289 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2291 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
2293 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2295 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
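/* B 8x8 with per-partition references: as in the P case, when 16x16 settled
 * on ref0 the per-list reference search is capped at the newest ref used by
 * the neighbors; each 8x8 then picks the best of L0/L1/BI/direct. */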
2299 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
2301 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2302 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
2304 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
2305 * than those used by the neighbors */
2306 #define CHECK_NEIGHBOUR(i)\
2308 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
2309 if( ref > i_maxref[l] )\
2313 for( int l = 0; l < 2; l++ )
2315 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2316 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
2317 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
2320 CHECK_NEIGHBOUR( -8 - 1 );
2321 CHECK_NEIGHBOUR( -8 + 0 );
2322 CHECK_NEIGHBOUR( -8 + 2 );
2323 CHECK_NEIGHBOUR( -8 + 4 );
2324 CHECK_NEIGHBOUR( 0 - 1 );
2325 CHECK_NEIGHBOUR( 2*8 - 1 );
2329 /* XXX Needed for x264_mb_predict_mv */
2330 h->mb.i_partition = D_8x8;
2334 for( int i = 0; i < 4; i++ )
2340 intptr_t stride[2] = {8,8};
2343 m.i_pixel = PIXEL_8x8;
2344 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2346 for( int l = 0; l < 2; l++ )
2348 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2350 lX->me8x8[i].cost = INT_MAX;
2351 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
2353 m.i_ref_cost = REF_COST( l, i_ref );
2355 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
2357 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
2358 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2359 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
2360 m.cost += m.i_ref_cost;
2362 if( m.cost < lX->me8x8[i].cost )
2364 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
2365 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
2368 /* save mv for predicting other partitions within this MB */
2369 CP32( lX->mvc[i_ref][i+1], m.mv );
2374 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
2375 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
2376 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
2377 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
2378 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
2379 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
2381 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2382 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
2383 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
2384 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2386 if( h->mb.b_chroma_me )
2388 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2389 i_part_cost_bi += i_chroma_cost;
2390 a->i_satd8x8[2][i] += i_chroma_cost;
2393 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2394 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2396 i_part_cost = a->l0.me8x8[i].cost;
2397 h->mb.i_sub_partition[i] = D_L0_8x8;
2398 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2399 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2400 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2401 a->i_cost8x8bi += i_part_cost;
2403 /* XXX Needed for x264_mb_predict_mv */
2404 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2408 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2411 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2414 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2415 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2416 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2418 /* XXX Needed for x264_mb_predict_mv */
2419 h->mb.i_partition = D_8x8;
2423 for( int i = 0; i < 4; i++ )
2428 int i_part_cost_bi = 0;
2429 intptr_t stride[2] = {8,8};
2432 for( int l = 0; l < 2; l++ )
2434 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2435 x264_me_t *m = &lX->me8x8[i];
2436 m->i_pixel = PIXEL_8x8;
2437 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2439 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2440 m->i_ref = lX->me16x16.i_ref;
2442 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2444 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2445 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2446 x264_me_search( h, m, &lX->me16x16.mv, 1 );
2447 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2448 m->cost += m->i_ref_cost;
2450 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2452 /* save mv for predicting other partitions within this MB */
2453 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2456 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2457 m->mv[0], m->mv[1], 8, 8, x264_weight_none );
2458 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2460 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2461 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2462 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2463 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2464 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2466 if( h->mb.b_chroma_me )
2468 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2469 i_part_cost_bi += i_chroma_cost;
2470 a->i_satd8x8[2][i] += i_chroma_cost;
2473 i_part_cost = a->l0.me8x8[i].cost;
2474 h->mb.i_sub_partition[i] = D_L0_8x8;
2475 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2476 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2477 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2478 a->i_cost8x8bi += i_part_cost;
2480 /* XXX Needed for x264_mb_predict_mv */
2481 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2485 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2488 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2490 ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
2491 ALIGNED_4( int16_t mvc[3][2] );
2493 h->mb.i_partition = D_16x8;
2494 a->i_cost16x8bi = 0;
2496 for( int i = 0; i < 2; i++ )
2499 int i_part_cost_bi = 0;
2500 intptr_t stride[2] = {16,16};
2503 m.i_pixel = PIXEL_16x8;
2504 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2506 for( int l = 0; l < 2; l++ )
2508 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2509 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2510 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2511 lX->me16x8[i].cost = INT_MAX;
2512 for( int j = 0; j < i_ref8s; j++ )
2514 int i_ref = ref8[j];
2515 m.i_ref_cost = REF_COST( l, i_ref );
2517 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2519 CP32( mvc[0], lX->mvc[i_ref][0] );
2520 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2521 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2523 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2524 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2525 x264_me_search( h, &m, mvc, 3 );
2526 m.cost += m.i_ref_cost;
2528 if( m.cost < lX->me16x8[i].cost )
2529 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2534 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2535 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
2536 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2537 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
2538 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2539 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2541 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2542 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2543 + a->l1.me16x8[i].i_ref_cost;
2545 if( h->mb.b_chroma_me )
2546 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
2548 i_part_cost = a->l0.me16x8[i].cost;
2549 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually an 8x8 partition; only the list (L0) matters */
2551 if( a->l1.me16x8[i].cost < i_part_cost )
2553 i_part_cost = a->l1.me16x8[i].cost;
2554 a->i_mb_partition16x8[i] = D_L1_8x8;
2556 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2558 i_part_cost = i_part_cost_bi;
2559 a->i_mb_partition16x8[i] = D_BI_8x8;
2561 a->i_cost16x8bi += i_part_cost;
2563 /* Early termination based on the current SATD score of partition[0]
2564 plus the estimated SATD score of partition[1] */
2565 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2566 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2568 a->i_cost16x8bi = COST_MAX;
2572 x264_mb_cache_mv_b16x8( h, a, i, 0 );
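/* mb type cost: the sub-partition enum groups L0/L1/BI in fours, so part>>2
 * yields 0/1/2 and the pair indexes the 3x3 grid of B 16x8 macroblock types
 * (B_L0_L0 .. B_BI_BI). */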
2576 a->i_mb_type16x8 = B_L0_L0
2577 + (a->i_mb_partition16x8[0]>>2) * 3
2578 + (a->i_mb_partition16x8[1]>>2);
2579 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2582 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2584 ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
2585 ALIGNED_4( int16_t mvc[3][2] );
2587 h->mb.i_partition = D_8x16;
2588 a->i_cost8x16bi = 0;
2590 for( int i = 0; i < 2; i++ )
2593 int i_part_cost_bi = 0;
2594 intptr_t stride[2] = {8,8};
2597 m.i_pixel = PIXEL_8x16;
2598 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2600 for( int l = 0; l < 2; l++ )
2602 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2603 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2604 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2605 lX->me8x16[i].cost = INT_MAX;
2606 for( int j = 0; j < i_ref8s; j++ )
2608 int i_ref = ref8[j];
2609 m.i_ref_cost = REF_COST( l, i_ref );
2611 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2613 CP32( mvc[0], lX->mvc[i_ref][0] );
2614 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2615 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2617 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2618 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2619 x264_me_search( h, &m, mvc, 3 );
2620 m.cost += m.i_ref_cost;
2622 if( m.cost < lX->me8x16[i].cost )
2623 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2628 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2629 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
2630 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2631 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
2632 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2634 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2635 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2636 + a->l1.me8x16[i].i_ref_cost;
2638 if( h->mb.b_chroma_me )
2639 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
2641 i_part_cost = a->l0.me8x16[i].cost;
2642 a->i_mb_partition8x16[i] = D_L0_8x8;
2644 if( a->l1.me8x16[i].cost < i_part_cost )
2646 i_part_cost = a->l1.me8x16[i].cost;
2647 a->i_mb_partition8x16[i] = D_L1_8x8;
2649 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2651 i_part_cost = i_part_cost_bi;
2652 a->i_mb_partition8x16[i] = D_BI_8x8;
2654 a->i_cost8x16bi += i_part_cost;
2656 /* Early termination based on the current SATD score of partition[0]
2657 plus the estimated SATD score of partition[1] */
2658 if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2659 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
2661 a->i_cost8x16bi = COST_MAX;
2665 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2669 a->i_mb_type8x16 = B_L0_L0
2670 + (a->i_mb_partition8x16[0]>>2) * 3
2671 + (a->i_mb_partition8x16[1]>>2);
2672 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
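/* P-frame RD stage: re-score the SATD-best partitions with full rate-distortion
 * cost, considering only candidates within 5/4 of the SATD winner (unless early
 * termination is disabled). For P_8x8 with sub-8x8 partitions enabled, each 8x8
 * first picks its subtype by partial RD (x264_rd_cost_part). */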
2675 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2677 int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
2679 h->mb.i_type = P_L0;
2680 if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
2682 h->mb.i_partition = D_16x16;
2683 x264_analyse_update_cache( h, a );
2684 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2687 if( a->l0.i_cost16x8 < thresh )
2689 h->mb.i_partition = D_16x8;
2690 x264_analyse_update_cache( h, a );
2691 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2694 a->l0.i_cost16x8 = COST_MAX;
2696 if( a->l0.i_cost8x16 < thresh )
2698 h->mb.i_partition = D_8x16;
2699 x264_analyse_update_cache( h, a );
2700 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2703 a->l0.i_cost8x16 = COST_MAX;
2705 if( a->l0.i_cost8x8 < thresh )
2707 h->mb.i_type = P_8x8;
2708 h->mb.i_partition = D_8x8;
2709 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2711 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2712 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2713 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2714 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2715 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2716 * for future blocks are those left over from previous RDO calls. */
2717 for( int i = 0; i < 4; i++ )
2719 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2720 int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
2721 int subtype, btype = D_L0_8x8;
2722 uint64_t bcost = COST_MAX64;
2723 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2726 if( costs[subtype] > sub8x8_thresh )
2728 h->mb.i_sub_partition[i] = subtype;
2729 x264_mb_cache_mv_p8x8( h, a, i );
2730 if( subtype == btype )
2732 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2733 COPY2_IF_LT( bcost, cost, btype, subtype );
2735 if( h->mb.i_sub_partition[i] != btype )
2737 h->mb.i_sub_partition[i] = btype;
2738 x264_mb_cache_mv_p8x8( h, a, i );
2743 x264_analyse_update_cache( h, a );
2744 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2747 a->l0.i_cost8x8 = COST_MAX;
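/* B-frame analogue: each mode is RD-scored at most once (its i_rd slot starts
 * at COST_MAX) and only if its SATD cost lies within roughly 17/16 of the best
 * inter SATD, with a little extra slack under psy-RD. */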
2750 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2752 int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
2754 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2756 h->mb.i_type = B_DIRECT;
2757 /* Assumes direct/skip MC is still in fdec */
2758 /* Requires b-rdo to be done before intra analysis */
2759 h->mb.b_skip_mc = 1;
2760 x264_analyse_update_cache( h, a );
2761 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2762 h->mb.b_skip_mc = 0;
2765 //FIXME not all the update_cache calls are needed
2766 h->mb.i_partition = D_16x16;
2768 if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
2770 h->mb.i_type = B_L0_L0;
2771 x264_analyse_update_cache( h, a );
2772 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2776 if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
2778 h->mb.i_type = B_L1_L1;
2779 x264_analyse_update_cache( h, a );
2780 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2784 if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
2786 h->mb.i_type = B_BI_BI;
2787 x264_analyse_update_cache( h, a );
2788 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2792 if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
2794 h->mb.i_type = B_8x8;
2795 h->mb.i_partition = D_8x8;
2796 x264_analyse_update_cache( h, a );
2797 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2798 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2802 if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
2804 h->mb.i_type = a->i_mb_type16x8;
2805 h->mb.i_partition = D_16x8;
2806 x264_analyse_update_cache( h, a );
2807 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2811 if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
2813 h->mb.i_type = a->i_mb_type8x16;
2814 h->mb.i_partition = D_8x16;
2815 x264_analyse_update_cache( h, a );
2816 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
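/* Jointly re-refine the L0/L1 MV pair (SATD criterion) of every partition that
 * ended up bidirectionally predicted. */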
2820 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2824 if( IS_INTRA(h->mb.i_type) )
2827 switch( h->mb.i_partition )
2830 if( h->mb.i_type == B_BI_BI )
2832 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2833 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2837 for( int i = 0; i < 2; i++ )
2838 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2840 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2841 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2845 for( int i = 0; i < 2; i++ )
2846 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2848 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2849 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2853 for( int i = 0; i < 4; i++ )
2854 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2856 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2857 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
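/* Transform size decision without RD: compare SA8D (a proxy for the 8x8
 * transform) against SATD (4x4) on the current inter prediction. The merged
 * sa8d_satd primitive computes both in one pass, returning sa8d in the low
 * 32 bits and satd in the high 32. */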
2863 static inline void x264_mb_analyse_transform( x264_t *h )
2865 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2867 /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
2870 int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
2871 int i_cost8 = 0, i_cost4 = 0;
2872 /* Not all platforms have a merged SATD function */
2873 if( h->pixf.sa8d_satd[PIXEL_16x16] )
2876 for( int p = 0; p < plane_count; p++ )
2878 cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2879 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2882 i_cost8 = (uint32_t)cost;
2883 i_cost4 = (uint32_t)(cost >> 32);
2887 for( int p = 0; p < plane_count; p++ )
2889 i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2890 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2891 i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
2892 h->mb.pic.p_fdec[p], FDEC_STRIDE );
2896 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2897 h->mb.b_skip_mc = 1;
2901 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2903 if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
2905 uint32_t subpart_bak = M32( h->mb.i_sub_partition );
2906 /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
2907 if( h->mb.i_type == P_8x8 )
2908 M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
2909 else if( !x264_transform_allowed[h->mb.i_type] )
2912 x264_analyse_update_cache( h, a );
2913 h->mb.b_transform_8x8 ^= 1;
2914 /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
2915 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2917 if( *i_rd >= i_rd8 )
2920 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2925 h->mb.b_transform_8x8 ^= 1;
2926 M32( h->mb.i_sub_partition ) = subpart_bak;
2931 /* Rate-distortion optimal QP selection.
2932 * FIXME: More than half of the benefit of this function seems to be
2933 * in the way it improves the coding of chroma DC (by decimating or
2934 * finding a better way to code a single DC coefficient.)
2935 * There must be a more efficient way to get that portion of the benefit
2936 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2937 * trick. */
2938 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2940 int bcost, cost, failures, prevcost, origcost;
2941 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2942 int last_qp_tried = 0;
2943 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2944 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2946 /* If CBP is already zero, don't raise the quantizer any higher. */
2947 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2949 /* Without psy-RD, require monotonicity when moving quant away from previous
2950 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2951 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2952 * allow 2 failures when moving quant towards previous quant.
2953 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2954 int threshold = (!!h->mb.i_psy_rd);
2955 /* Raise the threshold for failures if we're moving towards the last QP. */
2956 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2957 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2959 h->mb.i_qp = orig_qp;
2961 prevcost = origcost;
2963 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2964 * (up to a point) will too. So, jump down to where the threshold will kick in
2965 * and check the QP there. If the CBP is still empty, skip the main loop.
2966 * If it isn't empty, we would have ended up having to check this QP anyways,
2967 * so as long as we store it for later lookup, we lose nothing. */
2968 int already_checked_qp = -1;
2969 int already_checked_cost = COST_MAX;
2970 if( direction == -1 )
2974 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
2975 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2976 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2977 if( !h->mb.cbp[h->mb.i_mb_xy] )
2979 /* If our empty-CBP block is lower QP than the last QP,
2980 * the last QP almost surely doesn't have a CBP either. */
2981 if( h->mb.i_last_qp > h->mb.i_qp )
2985 already_checked_qp = h->mb.i_qp;
2986 h->mb.i_qp = orig_qp;
2990 h->mb.i_qp += direction;
2991 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
2993 if( h->mb.i_last_qp == h->mb.i_qp )
2995 if( h->mb.i_qp == already_checked_qp )
2996 cost = already_checked_cost;
2999 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3000 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3001 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3004 /* We can't assume that the costs are monotonic over QPs.
3005 * Treating a tie as a failure seems to give better results. */
3006 if( cost < prevcost )
3012 if( failures > threshold )
3014 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
3016 h->mb.i_qp += direction;
3020 /* Always try the last block's QP. */
3021 if( !last_qp_tried )
3023 h->mb.i_qp = h->mb.i_last_qp;
3024 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3025 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3026 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
3030 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
3032 /* Check transform again; decision from before may no longer be optimal. */
3033 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
3034 x264_mb_transform_8x8_allowed( h ) )
3036 h->mb.b_transform_8x8 ^= 1;
3037 cost = x264_rd_cost_mb( h, a->i_lambda2 );
3039 h->mb.b_transform_8x8 ^= 1;
3043 /*****************************************************************************
3044 * x264_macroblock_analyse:
3045 *****************************************************************************/
3046 void x264_macroblock_analyse( x264_t *h )
3048 x264_mb_analysis_t analysis;
3049 int i_cost = COST_MAX;
3051 h->mb.i_qp = x264_ratecontrol_mb_qp( h );
3052 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
3053 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
3054 if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
3055 h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;
3057 if( h->param.analyse.b_mb_info )
3058 h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
3059 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
3061 /*--------------------------- Do the analysis ---------------------------*/
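/* Slice-type dispatch: I slices run intra analysis only; P and B slices try
 * skip detection first, then inter partitions, with intra as a fallback. */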
3062 if( h->sh.i_type == SLICE_TYPE_I )
3065 if( analysis.i_mbrd )
3066 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3067 x264_mb_analyse_intra( h, &analysis, COST_MAX );
3068 if( analysis.i_mbrd )
3069 x264_intra_rd( h, &analysis, COST_MAX );
3071 i_cost = analysis.i_satd_i16x16;
3072 h->mb.i_type = I_16x16;
3073 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
3074 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
3075 if( analysis.i_satd_pcm < i_cost )
3076 h->mb.i_type = I_PCM;
3078 else if( analysis.i_mbrd >= 2 )
3079 x264_intra_rd_refine( h, &analysis );
3081 else if( h->sh.i_type == SLICE_TYPE_P )
3085 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
3087 analysis.b_try_skip = 0;
3088 if( analysis.b_force_intra )
3090 if( !h->param.analyse.b_psy )
3092 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3093 goto intra_analysis;
3098 /* Special fast-skip logic using information from mb_info. */
3099 if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
3101 if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
3102 h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
3104 h->mb.i_partition = D_16x16;
3105 /* Use the P-SKIP MV if we can... */
3106 if( !M32(h->mb.cache.pskip_mv) )
3109 h->mb.i_type = P_SKIP;
3111 /* Otherwise, just force a 16x16 block. */
3114 h->mb.i_type = P_L0;
3115 analysis.l0.me16x16.i_ref = 0;
3116 M32( analysis.l0.me16x16.mv ) = 0;
3120 /* Reset the information accordingly */
3121 else if( h->param.analyse.b_mb_info_update )
3122 h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
3125 int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
3126 /* If the current macroblock is off the frame, just skip it. */
3127 if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
3129 /* Fast P_SKIP detection */
3130 else if( h->param.analyse.b_fast_pskip )
3133 // FIXME don't need to check this if the reference frame is done
3135 else if( h->param.analyse.i_subpel_refine >= 3 )
3136 analysis.b_try_skip = 1;
3137 else if( h->mb.i_mb_type_left[0] == P_SKIP ||
3138 h->mb.i_mb_type_top == P_SKIP ||
3139 h->mb.i_mb_type_topleft == P_SKIP ||
3140 h->mb.i_mb_type_topright == P_SKIP )
3141 b_skip = x264_macroblock_probe_pskip( h );
3145 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
3149 h->mb.i_type = P_SKIP;
3150 h->mb.i_partition = D_16x16;
3151 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
3153 /* Set up MVs for future predictors */
3154 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3155 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3159 const unsigned int flags = h->param.analyse.inter;
3162 int i_satd_inter, i_satd_intra;
3164 x264_mb_analyse_load_costs( h, &analysis );
3166 x264_mb_analyse_inter_p16x16( h, &analysis );
3168 if( h->mb.i_type == P_SKIP )
3170 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3171 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3175 if( flags & X264_ANALYSE_PSUB16x16 )
3177 if( h->param.analyse.b_mixed_references )
3178 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
3180 x264_mb_analyse_inter_p8x8( h, &analysis );
3183 /* Select best inter mode */
3185 i_partition = D_16x16;
3186 i_cost = analysis.l0.me16x16.cost;
3188 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3189 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
3192 i_partition = D_8x8;
3193 i_cost = analysis.l0.i_cost8x8;
3196 if( flags & X264_ANALYSE_PSUB8x8 )
3198 for( int i = 0; i < 4; i++ )
3200 x264_mb_analyse_inter_p4x4( h, &analysis, i );
3201 int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
3202 if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
3204 int i_cost8x8 = analysis.l0.i_cost4x4[i];
3205 h->mb.i_sub_partition[i] = D_L0_4x4;
3207 x264_mb_analyse_inter_p8x4( h, &analysis, i );
3208 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
3209 h->mb.i_sub_partition[i], D_L0_8x4 );
3211 x264_mb_analyse_inter_p4x8( h, &analysis, i );
3212 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
3213 h->mb.i_sub_partition[i], D_L0_4x8 );
3215 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
3217 x264_mb_cache_mv_p8x8( h, &analysis, i );
3219 analysis.l0.i_cost8x8 = i_cost;
3223 /* Now do 16x8/8x16 */
3224 int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
3225 if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
3226 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
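/* Estimate the cost of each second half as the sum of its two 8x8 SATDs plus
 * the average of their MV/ref costs; x264_mb_analyse_inter_p16x8/p8x16 use
 * these estimates to terminate early after searching half 0. */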
3228 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
3229 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3230 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3232 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
3233 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
3235 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
3236 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
3237 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
3239 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
3240 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
3243 h->mb.i_partition = i_partition;
3246 //FIXME mb_type costs?
3247 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3251 else if( i_partition == D_16x16 )
3253 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3254 i_cost = analysis.l0.me16x16.cost;
3256 else if( i_partition == D_16x8 )
3258 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
3259 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
3260 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
3262 else if( i_partition == D_8x16 )
3264 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
3265 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
3266 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
3268 else if( i_partition == D_8x8 )
3271 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3273 switch( h->mb.i_sub_partition[i8x8] )
3276 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
3277 i_cost += analysis.l0.me8x8[i8x8].cost;
3280 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
3281 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
3282 i_cost += analysis.l0.me8x4[i8x8][0].cost +
3283 analysis.l0.me8x4[i8x8][1].cost;
3286 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
3287 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
3288 i_cost += analysis.l0.me4x8[i8x8][0].cost +
3289 analysis.l0.me4x8[i8x8][1].cost;
3293 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
3294 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
3295 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
3296 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
3297 i_cost += analysis.l0.me4x4[i8x8][0].cost +
3298 analysis.l0.me4x4[i8x8][1].cost +
3299 analysis.l0.me4x4[i8x8][2].cost +
3300 analysis.l0.me4x4[i8x8][3].cost;
3303 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
3309 if( h->mb.b_chroma_me )
3313 x264_mb_analyse_intra( h, &analysis, i_cost );
3314 x264_mb_analyse_intra_chroma( h, &analysis );
3318 x264_mb_analyse_intra_chroma( h, &analysis );
3319 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
3321 analysis.i_satd_i16x16 += analysis.i_satd_chroma;
3322 analysis.i_satd_i8x8 += analysis.i_satd_chroma;
3323 analysis.i_satd_i4x4 += analysis.i_satd_chroma;
3326 x264_mb_analyse_intra( h, &analysis, i_cost );
3328 i_satd_inter = i_cost;
3329 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
3330 analysis.i_satd_i8x8,
3331 analysis.i_satd_i4x4 );
3333 if( analysis.i_mbrd )
3335 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
3337 i_partition = D_16x16;
3338 i_cost = analysis.l0.i_rd16x16;
3339 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
3340 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
3341 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
3342 h->mb.i_type = i_type;
3343 h->mb.i_partition = i_partition;
3344 if( i_cost < COST_MAX )
3345 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3346 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
3349 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3350 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3351 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3352 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3354 h->mb.i_type = i_type;
3356 if( analysis.b_force_intra && !IS_INTRA(i_type) )
3358 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
3359 * it were an inter block. */
3360 x264_analyse_update_cache( h, &analysis );
3361 x264_macroblock_encode( h );
3362 for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
3363 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
3366 int height = 16 >> CHROMA_V_SHIFT;
3367 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
3368 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
3370 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3371 goto intra_analysis;
            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    x264_analyse_update_cache( h, &analysis );
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
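
        /* Decide whether this MB can be coded as B_SKIP: either via a cheap probe,
         * or with mbrd by checking that the SSD of the direct-predicted block is
         * already below the minimum cost of any coded macroblock. */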
        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }
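
            /* 16x16 mode decision: start from L0 and let L1, bipred and direct
             * take over only on a strictly lower SATD cost. */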
            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
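
                /* Each 16x8 half covers the same pixels as two vertically adjacent
                 * 8x8 blocks (each 8x16 half, two horizontally adjacent ones), so
                 * its SATD is estimated as their sum and the MV/ref bits as the
                 * rounded average of the two 8x8 costs. */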
                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
                    /* 16x8 */
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;

                    /* 8x16 */
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];

                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }

            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel-only (everything else is already either subpel or reference checked) */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
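            /* x264_mb_partition_listX_table[l][t] flags whether sub-partition type t
             * carries list-l motion; D_DIRECT_8x8 parts have no MVs of their own and
             * are skipped. */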
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }
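
            /* i_cost now holds the best SATD-based inter cost; with mbrd, redo the
             * final decision in the RD domain with B_SKIP (i_bskip_cost) as the
             * baseline competitor. */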
            i_satd_inter = i_cost;

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter );
                    x264_mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    x264_mb_analyse_intra_chroma( h, &analysis );
                    x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );
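
                /* Single-list winners get RD-based qpel refinement; bipred winners
                 * additionally need the bipred weight for this reference pair. */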
                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }

    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
    }
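
    /* Without mbrd the 8x8-transform decision is made here from SATD costs; with
     * mbrd it was already settled by x264_mb_analyse_transform_rd above. */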
    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;
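
        /* P_SKIP implies ref 0 with the predicted skip MV; B_SKIP and B_DIRECT
         * reuse whatever partition and MVs direct prediction produced. */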
        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;
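
        /* For the one-directional 16x16 B types the unused list is marked with
         * ref -1 and zero MV/MVD so that later neighbor prediction reads
         * well-defined values. */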
        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }
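
    /* With frame-parallel threading an MV may only reference rows of the reference
     * frame that other threads have already finished reconstructing; a violation
     * here means the analysis above has a bug, so recover by forcing intra. */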
#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"