1 /*****************************************************************************
2 * analyse.c: macroblock analysis
3 *****************************************************************************
4 * Copyright (C) 2003-2011 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
28 #define _ISOC99_SOURCE
30 #include "common/common.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
42 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
74 /* conduct the analysis using this lambda and QP */
79 uint16_t *p_cost_ref[2];
84 /* Take some shortcuts in intra search if intra is deemed unlikely */
86 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
87 int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
92 int i_satd_i16x16_dir[7];
97 int i_satd_i8x8_dir[12][4];
101 int i_predict4x4[16];
106 int i_satd_i8x8chroma;
107 int i_satd_i8x8chroma_dir[7];
108 int i_predict8x8chroma;
110 /* II: Inter part P/B frame */
111 x264_mb_analysis_list_t l0;
112 x264_mb_analysis_list_t l1;
114 int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
115 int i_cost16x16direct;
117 int i_cost8x8direct[4];
118 int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
119 int i_cost_est16x8[2]; /* Per-partition estimated cost */
120 int i_cost_est8x16[2];
129 int i_mb_partition16x8[2]; /* mb_partition_e */
130 int i_mb_partition8x16[2];
131 int i_mb_type16x8; /* mb_class_e */
134 int b_direct_available;
136 } x264_mb_analysis_t;
138 /* lambda = pow(2,qp/6-2) */
139 const uint16_t x264_lambda_tab[QP_MAX_MAX+1] =
141 1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */
142 1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */
143 2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */
144 4, 4, 5, 6, 6, 7, 8, 9, /* 24-31 */
145 10, 11, 13, 14, 16, 18, 20, 23, /* 32-39 */
146 25, 29, 32, 36, 40, 45, 51, 57, /* 40-47 */
147 64, 72, 81, 91, 102, 114, 128, 144, /* 48-55 */
148 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
149 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
150 1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
151 2580,2896, /* 80-81 */
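/* Illustrative sketch (not part of the original file): assuming round-to-nearest with
 * a floor of 1, the table above can be regenerated as
 *     for( int qp = 0; qp <= QP_MAX_MAX; qp++ )
 *         lambda_tab[qp] = X264_MAX( (int)( pow( 2, qp/6.0 - 2 ) + 0.5 ), 1 );
 * e.g. qp=36 -> 2^4 = 16, qp=79 -> 2^(79/6-2) ~= 2299. */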
154 /* lambda2 = pow(lambda,2) * .9 * 256 */
155 /* Capped to avoid overflow */
156 const int x264_lambda2_tab[QP_MAX_MAX+1] =
158 14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */
159 91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */
160 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */
161 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24-31 */
162 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32-39 */
163 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40-47 */
164 943718, 1189010, 1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
165 5992238, 7549747, 9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
166 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */
167 134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */
168 134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */
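/* Illustrative check (not part of the original file): keeping lambda = 2^(qp/6-2) unrounded,
 * lambda2[qp] = round( lambda*lambda * .9 * 256 ) = round( .9 * 2^(qp/3+4) ),
 * e.g. qp=24 -> .9 * 2^12 = 3686.4 -> 3686; from roughly qp 70 upward the entries are
 * capped at 134217727 (2^27 - 1). */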
171 const uint8_t x264_exp2_lut[64] =
173 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
174 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
175 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
176 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
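/* Illustrative note (not part of the original file): each entry appears to be
 * round( 256 * (2^(i/64) - 1) ), e.g. i=32 -> 256*(sqrt(2)-1) = 106.04 -> 106,
 * i.e. a Q6 fractional exponent maps to a Q8 mantissa correction. */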
179 const float x264_log2_lut[128] =
181 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
182 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
183 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
184 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
185 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
186 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
187 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
188 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
189 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
190 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
191 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
192 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
193 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
194 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
195 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
196 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
199 /* Avoid an int/float conversion. */
200 const float x264_log2_lz_lut[32] =
202 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
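/* Illustrative sketch (the actual helper lives elsewhere, e.g. common.h; shown here only to
 * explain how the two tables combine into a float log2 without an int->float conversion):
 *     static inline float my_log2( uint32_t x )
 *     {
 *         int lz = x264_clz( x );
 *         return x264_log2_lut[(x << lz >> 24) & 0x7f] + x264_log2_lz_lut[lz];
 *     }
 * x264_log2_lut[i] holds log2(1 + i/128) and x264_log2_lz_lut[lz] holds 31 - lz. */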
205 // should the intra and inter lambdas be different?
206 // I'm just matching the behaviour of deadzone quant.
207 static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] =
209 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
211 46, 58, 73, 92, 117, 147,
212 185, 233, 294, 370, 466, 587,
213 740, 932, 1174, 1480, 1864, 2349,
214 2959, 3728, 4697, 5918, 7457, 9395,
215 11837, 14914, 18790, 23674, 29828, 37581,
216 47349, 59656, 75163, 94699, 119313, 150326,
217 189399, 238627, 300652, 378798, 477255, 601304,
218 757596, 954511, 1202608, 1515192, 1909022, 2405217,
219 3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
220 12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
221 48486154, 61088726, 76966972, 96972308,
222 122177453,134217727,134217727,134217727,134217727,134217727,
223 134217727,134217727,134217727,134217727,134217727,134217727,
225 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
227 27, 34, 43, 54, 68, 86,
228 108, 136, 172, 216, 273, 343,
229 433, 545, 687, 865, 1090, 1374,
230 1731, 2180, 2747, 3461, 4361, 5494,
231 6922, 8721, 10988, 13844, 17442, 21976,
232 27688, 34885, 43953, 55377, 69771, 87906,
233 110755, 139543, 175813, 221511, 279087, 351627,
234 443023, 558174, 703255, 886046, 1116348, 1406511,
235 1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
236 7088374, 8930791, 11252092, 14176748, 17861583, 22504184,
237 28353495, 35723165, 45008368, 56706990,
238 71446330, 90016736,113413980,134217727,134217727,134217727,
239 134217727,134217727,134217727,134217727,134217727,134217727,
240 134217727,134217727,134217727,134217727,134217727,134217727,
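/* Illustrative check (not part of the original file): the values above are consistent with
 * LAMBDA_BITS == 4, e.g. inter qp=0: .85*.85 * 2^(0/3 + 10 - 4) = 0.7225*64 = 46.2 -> 46,
 * and intra qp=0: .65*.65 * 64 = 27.04 -> 27; both columns are capped at 2^27 - 1. */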
244 #define MAX_CHROMA_LAMBDA_OFFSET 36
245 static const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] =
247 16, 20, 25, 32, 40, 50,
248 64, 80, 101, 128, 161, 203,
249 256, 322, 406, 512, 645, 812,
250 1024, 1290, 1625, 2048, 2580, 3250,
251 4096, 5160, 6501, 8192, 10321, 13003,
252 16384, 20642, 26007, 32768, 41285, 52015,
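/* Illustrative note (not part of the original file): the entries appear to follow
 * 16 * 2^(i/3), so indexing with qp - chroma_qp + 12 (as done below) scales the chroma
 * lambda2 by roughly 2^(qp_delta/3) in Q8 terms, with 256 (index 12) meaning "no change". */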
256 /* TODO: calculate CABAC costs */
257 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
259 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
261 static const uint8_t i_mb_b16x8_cost_table[17] =
263 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
265 static const uint8_t i_sub_mb_b_cost_table[13] =
267 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
269 static const uint8_t i_sub_mb_p_cost_table[4] =
274 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
276 static uint16_t x264_cost_ref[QP_MAX+1][3][33];
277 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
279 float *x264_analyse_prepare_costs( x264_t *h )
281 float *logs = x264_malloc( (2*4*2048+1)*sizeof(float) );
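/* Each logs[i] roughly models the exp-Golomb bit cost of coding a motion-vector
 * difference of magnitude i (about 2*log2(i+1) bits plus a small constant); the
 * 1.718 offset appears to be an empirical fit rather than an exact UVLC length. */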
285 for( int i = 1; i <= 2*4*2048; i++ )
286 logs[i] = log2f(i+1)*2 + 1.718f;
290 int x264_analyse_init_costs( x264_t *h, float *logs, int qp )
292 int lambda = x264_lambda_tab[qp];
295 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
296 CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
297 h->cost_mv[qp] += 2*4*2048;
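/* h->cost_mv[qp] now points at the centre of the allocation, so it can be indexed
 * directly with signed quarter-pel MV deltas in [-2*4*2048, 2*4*2048]. */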
298 for( int i = 0; i <= 2*4*2048; i++ )
301 h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
303 x264_pthread_mutex_lock( &cost_ref_mutex );
304 for( int i = 0; i < 3; i++ )
305 for( int j = 0; j < 33; j++ )
306 x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
307 x264_pthread_mutex_unlock( &cost_ref_mutex );
308 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
310 for( int j = 0; j < 4; j++ )
312 CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
313 h->cost_mv_fpel[qp][j] += 2*2048;
314 for( int i = -2*2048; i < 2*2048; i++ )
315 h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
323 void x264_analyse_free_costs( x264_t *h )
325 for( int i = 0; i < QP_MAX+1; i++ )
328 x264_free( h->cost_mv[i] - 2*4*2048 );
329 if( h->cost_mv_fpel[i][0] )
330 for( int j = 0; j < 4; j++ )
331 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
335 void x264_analyse_weight_frame( x264_t *h, int end )
337 for( int j = 0; j < h->i_ref[0]; j++ )
339 if( h->sh.weight[j][0].weightfn )
341 x264_frame_t *frame = h->fref[0][j];
342 int width = frame->i_width[0] + 2*PADH;
343 int i_padv = PADV << h->param.b_interlaced;
345 pixel *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
346 height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
347 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
348 h->fenc->i_lines_weighted += height;
350 for( int k = j; k < h->i_ref[0]; k++ )
351 if( h->sh.weight[k][0].weightfn )
353 pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
354 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
355 src + offset, frame->i_stride[0],
356 width, height, &h->sh.weight[k][0] );
363 /* initialize an array of lambda*nbits for all possible mvs */
364 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
366 a->p_cost_mv = h->cost_mv[a->i_qp];
367 a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
368 a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
371 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
373 int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
374 a->i_lambda = x264_lambda_tab[qp];
375 a->i_lambda2 = x264_lambda2_tab[qp];
377 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
378 if( h->param.analyse.i_trellis )
380 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
381 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
382 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
383 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
385 h->mb.i_psy_rd_lambda = a->i_lambda;
386 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
387 int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
388 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
390 if( qp > QP_MAX_SPEC )
392 h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
393 h->nr_residual_sum = h->nr_residual_sum_buf[1];
394 h->nr_count = h->nr_count_buf[1];
395 h->mb.b_noise_reduction = 1;
396 qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
400 h->nr_offset = h->nr_offset_denoise;
401 h->nr_residual_sum = h->nr_residual_sum_buf[0];
402 h->nr_count = h->nr_count_buf[0];
403 h->mb.b_noise_reduction = 0;
406 a->i_qp = h->mb.i_qp = qp;
407 h->mb.i_chroma_qp = h->chroma_qp_table[qp];
410 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
412 int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
414 /* mbrd == 1 -> RD mode decision */
415 /* mbrd == 2 -> RD refinement */
416 /* mbrd == 3 -> QPRD */
417 a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
418 h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
420 x264_mb_analyse_init_qp( h, a, qp );
422 h->mb.b_transform_8x8 = 0;
428 a->i_satd_i8x8chroma = COST_MAX;
430 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
431 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
434 a->b_avoid_topright = 0;
436 h->mb.b_lossless ? 0 :
438 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
440 /* II: Inter part P/B frame */
441 if( h->sh.i_type != SLICE_TYPE_I )
443 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
444 // limit motion search to a slightly smaller range than the theoretical limit,
445 // since the search may go a few iterations past its given range
446 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
448 /* Calculate max allowed MV range */
449 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
450 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
451 h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
452 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
453 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
454 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
456 int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
457 int max_mv = max_x - 4*16*h->mb.i_mb_x;
458 /* If we're left of the refresh bar, don't reference right of it. */
459 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
460 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
462 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
463 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
464 if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & h->param.b_interlaced) )
466 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
467 int thread_mvy_range = i_fmv_range;
469 if( h->i_thread_frames > 1 )
471 int pix_y = (h->mb.i_mb_y | h->param.b_interlaced) * 16;
472 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
473 for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
474 for( int j = 0; j < h->i_ref[i]; j++ )
476 x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
477 thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
480 if( h->param.b_deterministic )
481 thread_mvy_range = h->param.analyse.i_mv_range_thread;
482 if( h->param.b_interlaced )
483 thread_mvy_range >>= 1;
485 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
488 if( h->param.b_interlaced )
490 /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
491 for( int i = 0; i < 3; i++ )
494 mb_y = (h->mb.i_mb_y >> j) + (i == 1);
495 h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
496 h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
497 h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
498 h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
499 h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
500 h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
501 h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
506 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
507 h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
508 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
509 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
510 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
511 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
512 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
515 if( h->param.b_interlaced )
517 int i = h->mb.b_interlaced ? 2 : h->mb.i_mb_y&1;
518 h->mb.mv_min[1] = h->mb.mv_miny_row[i];
519 h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
520 h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
521 h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
522 h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i];
523 h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i];
531 a->l0.i_cost8x16 = COST_MAX;
532 if( h->sh.i_type == SLICE_TYPE_B )
537 a->i_cost8x8direct[0] =
538 a->i_cost8x8direct[1] =
539 a->i_cost8x8direct[2] =
540 a->i_cost8x8direct[3] =
549 a->i_cost16x16direct =
552 a->i_cost8x16bi = COST_MAX;
554 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
555 for( int i = 0; i < 4; i++ )
559 a->l0.i_cost4x8[i] = COST_MAX;
562 /* Fast intra decision */
563 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
565 /* Always run in fast-intra mode for subme < 3 */
566 if( h->mb.i_subpel_refine > 2 &&
567 ( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
568 IS_INTRA( h->mb.i_mb_type_top ) ||
569 IS_INTRA( h->mb.i_mb_type_topleft ) ||
570 IS_INTRA( h->mb.i_mb_type_topright ) ||
571 (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
572 (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
573 { /* intra is likely */ }
580 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
581 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
583 a->b_force_intra = 1;
585 a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
588 a->b_force_intra = 0;
592 /* Prediction modes allowed for various combinations of neighbors. */
593 /* Terminated by a -1. */
594 /* In order, no neighbors, left, top, top/left, top/left/topleft */
595 static const int8_t i16x16_mode_available[5][5] =
597 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
598 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
599 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
600 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
601 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
604 static const int8_t i8x8chroma_mode_available[5][5] =
606 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
607 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
608 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
609 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
610 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
613 static const int8_t i4x4_mode_available[2][5][10] =
616 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
617 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
618 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
619 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
620 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
623 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
624 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
625 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
626 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
627 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1},
631 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
633 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
634 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
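/* This relies on MB_LEFT and MB_TOP being the two low bits of the neighbour flags,
 * so the masked value maps straight onto rows 0-3 of the tables above
 * (none, left, top, top+left); row 4 is reserved for top+left+topleft. */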
635 return i16x16_mode_available[idx];
638 static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
640 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
641 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
642 return i8x8chroma_mode_available[idx];
645 static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
647 int avoid_topright = force_intra && (i&1);
648 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
649 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
650 return i4x4_mode_available[avoid_topright][idx];
653 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
655 int avoid_topright = force_intra && ((i&5) == 5);
656 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
657 idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
658 return i4x4_mode_available[avoid_topright][idx];
661 /* For trellis=2 we need to do this for both DCT sizes; for trellis=1 we only need it for the chosen mode. */
662 static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
664 ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
666 if( do_both_dct || h->mb.b_transform_8x8 )
667 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
668 if( do_both_dct || !h->mb.b_transform_8x8 )
669 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
672 /* Reset fenc satd scores cache for psy RD */
673 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
675 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
676 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
677 if( !h->mb.i_psy_rd )
679 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
680 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
682 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
685 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
687 if( a->i_satd_i8x8chroma < COST_MAX )
690 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
692 /* 8x8 prediction selection for chroma */
693 if( predict_mode[3] >= 0 && !h->mb.b_lossless )
695 int satdu[4], satdv[4];
696 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
697 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
698 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
699 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
700 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
701 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
703 for( ; *predict_mode >= 0; predict_mode++ )
705 int i_mode = *predict_mode;
706 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
708 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
709 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
714 for( ; *predict_mode >= 0; predict_mode++ )
717 int i_mode = *predict_mode;
719 /* we do the prediction */
720 if( h->mb.b_lossless )
721 x264_predict_lossless_8x8_chroma( h, i_mode );
724 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
725 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
728 /* we calculate the cost */
729 i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
730 h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
731 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
733 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
734 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
738 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
741 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
743 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
744 pixel *p_src = h->mb.pic.p_fenc[0];
745 pixel *p_dst = h->mb.pic.p_fdec[0];
746 static const int8_t intra_analysis_shortcut[2][2][2][5] =
748 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
749 {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
750 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
751 {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
752 {{{I_PRED_4x4_HU, -1, -1, -1, -1},
753 {-1, -1, -1, -1, -1}},
754 {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
755 {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
759 int lambda = a->i_lambda;
761 /*---------------- Try all modes and calculate their scores ---------------*/
763 /* 16x16 prediction selection */
764 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
766 /* Not heavily tuned */
767 static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
768 int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
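/* The LUT is in units of 1/2, so e.g. a value of 4 lets intra 16x16 cost up to
 * 2x the best inter SATD before intra analysis is cut short. */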
770 if( !h->mb.b_lossless && predict_mode[3] >= 0 )
772 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
773 a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
774 a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
775 a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
776 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
777 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
778 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
780 /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
781 if( a->i_satd_i16x16 <= i16x16_thresh )
783 h->predict_16x16[I_PRED_16x16_P]( p_dst );
784 a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
785 a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
786 COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
791 for( ; *predict_mode >= 0; predict_mode++ )
794 int i_mode = *predict_mode;
796 if( h->mb.b_lossless )
797 x264_predict_lossless_16x16( h, i_mode );
799 h->predict_16x16[i_mode]( p_dst );
801 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
802 lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
803 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
804 a->i_satd_i16x16_dir[i_mode] = i_satd;
808 if( h->sh.i_type == SLICE_TYPE_B )
809 /* cavlc mb type prefix */
810 a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];
812 if( a->i_satd_i16x16 > i16x16_thresh )
815 /* 8x8 prediction selection */
816 if( flags & X264_ANALYSE_I8x8 )
818 ALIGNED_ARRAY_16( pixel, edge,[33] );
819 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
820 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
822 // FIXME some bias like in i4x4?
823 int i_cost = lambda * 4; /* base predmode costs */
824 h->mb.i_cbp_luma = 0;
826 if( h->sh.i_type == SLICE_TYPE_B )
827 i_cost += lambda * i_mb_b_cost_table[I_8x8];
829 for( idx = 0;; idx++ )
833 pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
834 pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
835 int i_best = COST_MAX;
836 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
838 predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
839 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
841 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
844 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
845 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
846 satd[i_pred_mode] -= 3 * lambda;
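/* Bias towards the predicted (most probable) mode: signalling it costs roughly
 * 3 fewer bits than an explicitly coded prediction mode. */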
847 for( int i = 2; i >= 0; i-- )
850 a->i_satd_i8x8_dir[i][idx] = cost + 4 * lambda;
851 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
854 /* Take analysis shortcuts: don't analyse modes that are too
855 * far away direction-wise from the favored mode. */
856 if( a->i_mbrd < 1 + a->b_fast_intra )
857 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
862 for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
865 int i_mode = *predict_mode;
867 if( h->mb.b_lossless )
868 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
870 h->predict_8x8[i_mode]( p_dst_by, edge );
872 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
873 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
874 i_satd -= 3 * lambda;
876 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
877 a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * lambda;
879 i_cost += i_best + 3 * lambda;
881 if( idx == 3 || i_cost > i_satd_thresh )
884 /* we need to encode this block now so subsequent blocks can predict from it */
885 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
886 x264_mb_encode_i8x8( h, idx, a->i_qp );
888 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
893 a->i_satd_i8x8 = i_cost;
894 if( h->mb.i_skip_intra )
896 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
897 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
898 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
899 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
900 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
901 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
902 if( h->mb.i_skip_intra == 2 )
903 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
908 static const uint16_t cost_div_fix8[3] = {1024,512,341};
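/* Q8 factors of 4, 2 and 4/3: extrapolate the cost of the (idx+1) 8x8 blocks
 * analysed so far to a full-macroblock estimate for the threshold check below. */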
909 a->i_satd_i8x8 = COST_MAX;
910 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
912 /* Not heavily tuned */
913 static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
914 if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
918 /* 4x4 prediction selection */
919 if( flags & X264_ANALYSE_I4x4 )
921 int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
922 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
923 h->mb.i_cbp_luma = 0;
926 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
928 if( h->sh.i_type == SLICE_TYPE_B )
929 i_cost += lambda * i_mb_b_cost_table[I_4x4];
931 for( idx = 0;; idx++ )
933 pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
934 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
935 int i_best = COST_MAX;
936 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
938 predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
940 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
941 /* emulate missing topright samples */
942 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
944 if( !h->mb.b_lossless && predict_mode[5] >= 0 )
947 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
948 int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
949 satd[i_pred_mode] -= 3 * lambda;
950 for( int i = 2; i >= 0; i-- )
951 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
953 /* Take analysis shortcuts: don't analyse modes that are too
954 * far away direction-wise from the favored mode. */
955 if( a->i_mbrd < 1 + a->b_fast_intra )
956 predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
963 for( ; *predict_mode >= 0; predict_mode++ )
966 int i_mode = *predict_mode;
968 if( h->mb.b_lossless )
969 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
971 h->predict_4x4[i_mode]( p_dst_by );
973 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
974 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
976 i_satd -= lambda * 3;
980 a->i_predict4x4[idx] = i_mode;
985 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
988 i_cost += i_best + 3 * lambda;
990 if( i_cost > i_satd_thresh || idx == 15 )
993 /* we need to encode this block now so subsequent blocks can predict from it */
994 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
995 x264_mb_encode_i4x4( h, idx, a->i_qp );
997 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1001 a->i_satd_i4x4 = i_cost;
1002 if( h->mb.i_skip_intra )
1004 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
1005 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
1006 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1007 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
1008 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
1009 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
1010 if( h->mb.i_skip_intra == 2 )
1011 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
1015 a->i_satd_i4x4 = COST_MAX;
1019 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
1021 if( a->i_satd_i16x16 <= i_satd_thresh )
1023 h->mb.i_type = I_16x16;
1024 x264_analyse_update_cache( h, a );
1025 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1028 a->i_satd_i16x16 = COST_MAX;
1030 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
1032 h->mb.i_type = I_4x4;
1033 x264_analyse_update_cache( h, a );
1034 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
1037 a->i_satd_i4x4 = COST_MAX;
1039 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
1041 h->mb.i_type = I_8x8;
1042 x264_analyse_update_cache( h, a );
1043 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
1044 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
1047 a->i_satd_i8x8 = COST_MAX;
1050 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
1052 pixel *p_dst = h->mb.pic.p_fdec[0];
1053 uint64_t i_satd, i_best;
1054 h->mb.i_skip_intra = 0;
1056 if( h->mb.i_type == I_16x16 )
1058 int old_pred_mode = a->i_predict16x16;
1059 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
1060 int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
1061 i_best = a->i_satd_i16x16;
1062 for( ; *predict_mode >= 0; predict_mode++ )
1064 int i_mode = *predict_mode;
1065 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1067 h->mb.i_intra16x16_pred_mode = i_mode;
1068 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1069 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1073 /* RD selection for chroma prediction */
1074 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
1075 if( predict_mode[1] >= 0 )
1077 int8_t predict_mode_sorted[4];
1079 int i_thresh = a->i_satd_i8x8chroma * 5/4;
1081 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
1083 int i_mode = *predict_mode;
1084 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
1085 predict_mode_sorted[i_max++] = i_mode;
1090 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1091 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1092 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1093 * coefficients for the current chroma mode are still around; we only
1094 * have to recount the bits. */
1095 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1096 for( int i = 0; i < i_max; i++ )
1098 int i_mode = predict_mode_sorted[i];
1099 if( h->mb.b_lossless )
1100 x264_predict_lossless_8x8_chroma( h, i_mode );
1103 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1104 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1106 /* if we've already found a mode that needs no residual,
1107 * any mode with a residual will probably be worse,
1108 * so avoid the DCT on the remaining modes to improve speed. */
1109 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1110 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1112 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1113 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1117 if( h->mb.i_type == I_4x4 )
1119 pixel4 pels[4] = {0}; // doesn't need initializing, just shuts up a gcc warning
1121 for( int idx = 0; idx < 16; idx++ )
1123 pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1124 i_best = COST_MAX64;
1126 predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
1128 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1129 /* emulate missing topright samples */
1130 MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
1132 for( ; *predict_mode >= 0; predict_mode++ )
1134 int i_mode = *predict_mode;
1135 if( h->mb.b_lossless )
1136 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1138 h->predict_4x4[i_mode]( p_dst_by );
1139 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1141 if( i_best > i_satd )
1143 a->i_predict4x4[idx] = i_mode;
1145 pels[0] = MPIXEL_X4( p_dst_by+0*FDEC_STRIDE );
1146 pels[1] = MPIXEL_X4( p_dst_by+1*FDEC_STRIDE );
1147 pels[2] = MPIXEL_X4( p_dst_by+2*FDEC_STRIDE );
1148 pels[3] = MPIXEL_X4( p_dst_by+3*FDEC_STRIDE );
1149 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1153 MPIXEL_X4( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1154 MPIXEL_X4( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1155 MPIXEL_X4( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1156 MPIXEL_X4( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1157 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1159 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1162 else if( h->mb.i_type == I_8x8 )
1164 ALIGNED_ARRAY_16( pixel, edge,[33] );
1165 for( int idx = 0; idx < 4; idx++ )
1167 pixel4 pels_h[2] = {0};
1168 pixel pels_v[7] = {0};
1169 uint16_t i_nnz[2] = {0}; //shut up gcc
1171 int cbp_luma_new = 0;
1172 int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1174 i_best = COST_MAX64;
1177 int s8 = X264_SCAN8_0 + 2*x + 16*y;
1179 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1180 predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
1181 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1183 for( ; *predict_mode >= 0; predict_mode++ )
1185 int i_mode = *predict_mode;
1186 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1189 if( h->mb.b_lossless )
1190 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1192 h->predict_8x8[i_mode]( p_dst_by, edge );
1193 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1194 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1196 if( i_best > i_satd )
1198 a->i_predict8x8[idx] = i_mode;
1199 cbp_luma_new = h->mb.i_cbp_luma;
1202 pels_h[0] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 );
1203 pels_h[1] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 );
1205 for( int j = 0; j < 7; j++ )
1206 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1207 i_nnz[0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8] );
1208 i_nnz[1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8] );
1211 a->i_cbp_i8x8_luma = cbp_luma_new;
1212 MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 ) = pels_h[0];
1213 MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 ) = pels_h[1];
1215 for( int j = 0; j < 7; j++ )
1216 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1217 M16( &h->mb.cache.non_zero_count[s8 + 0*8] ) = i_nnz[0];
1218 M16( &h->mb.cache.non_zero_count[s8 + 1*8] ) = i_nnz[1];
1220 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1225 #define LOAD_FENC( m, src, xoff, yoff) \
1226 (m)->p_cost_mv = a->p_cost_mv; \
1227 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1228 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1229 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1230 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1231 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1233 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1234 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1235 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1236 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1237 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1238 (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \
1239 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1240 (m)->weight = weight_none; \
1243 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1244 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1245 (m)->weight = h->sh.weight[i_ref];
1247 #define REF_COST(list, ref) \
1248 (a->p_cost_ref[list][ref])
1250 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1254 ALIGNED_4( int16_t mvc[8][2] );
1255 int i_halfpel_thresh = INT_MAX;
1256 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1258 /* 16x16 Search on all ref frame */
1259 m.i_pixel = PIXEL_16x16;
1260 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1262 a->l0.me16x16.cost = INT_MAX;
1263 for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1265 m.i_ref_cost = REF_COST( 0, i_ref );
1266 i_halfpel_thresh -= m.i_ref_cost;
1268 /* search with ref */
1269 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1270 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1272 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1274 if( h->mb.ref_blind_dupe == i_ref )
1276 CP32( m.mv, a->l0.mvc[0][0] );
1277 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1281 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1282 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1285 /* save mv for predicting neighbors */
1286 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1287 CP32( a->l0.mvc[i_ref][0], m.mv );
1289 /* early termination
1290 * SSD threshold would probably be better than SATD */
1293 && m.cost-m.cost_mv < 300*a->i_lambda
1294 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1295 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1296 && x264_macroblock_probe_pskip( h ) )
1298 h->mb.i_type = P_SKIP;
1299 x264_analyse_update_cache( h, a );
1300 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1304 m.cost += m.i_ref_cost;
1305 i_halfpel_thresh += m.i_ref_cost;
1307 if( m.cost < a->l0.me16x16.cost )
1308 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1311 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1312 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1314 h->mb.i_type = P_L0;
1317 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1318 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1320 h->mb.i_partition = D_16x16;
1321 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1322 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1323 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1324 h->mb.i_type = P_SKIP;
1329 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1332 pixel **p_fenc = h->mb.pic.p_fenc;
1333 int i_maxref = h->mb.pic.i_fref[0]-1;
1335 h->mb.i_partition = D_8x8;
1337 #define CHECK_NEIGHBOUR(i)\
1339 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1340 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1344 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1345 * than those used by the neighbors */
1346 if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1347 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
1350 CHECK_NEIGHBOUR( -8 - 1 );
1351 CHECK_NEIGHBOUR( -8 + 0 );
1352 CHECK_NEIGHBOUR( -8 + 2 );
1353 CHECK_NEIGHBOUR( -8 + 4 );
1354 CHECK_NEIGHBOUR( 0 - 1 );
1355 CHECK_NEIGHBOUR( 2*8 - 1 );
1357 #undef CHECK_NEIGHBOUR
1359 for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
1360 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1362 for( int i = 0; i < 4; i++ )
1364 x264_me_t *l0m = &a->l0.me8x8[i];
1368 m.i_pixel = PIXEL_8x8;
1370 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1371 l0m->cost = INT_MAX;
1372 for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1374 m.i_ref_cost = REF_COST( 0, i_ref );
1376 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1377 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1379 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1380 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1381 if( h->mb.ref_blind_dupe == i_ref )
1383 CP32( m.mv, a->l0.mvc[0][i+1] );
1384 x264_me_refine_qpel_refdupe( h, &m, NULL );
1387 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1389 m.cost += m.i_ref_cost;
1391 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1393 if( m.cost < l0m->cost )
1394 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1395 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1396 i_ref = h->mb.ref_blind_dupe;
1400 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1401 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1403 a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );
1405 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1406 are effectively zero. */
1407 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1408 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1411 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1412 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1413 /* P_8x8 ref0 has no ref cost */
1414 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1415 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1416 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1417 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1418 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1421 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1423 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1424 * reference frame flags. Thus, if we're not doing mixedrefs, just
1425 * don't bother analysing the dupes. */
1426 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1427 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1428 pixel **p_fenc = h->mb.pic.p_fenc;
1430 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1432 /* XXX Needed for x264_mb_predict_mv */
1433 h->mb.i_partition = D_8x8;
1436 CP32( mvc[0], a->l0.me16x16.mv );
1438 for( int i = 0; i < 4; i++ )
1440 x264_me_t *m = &a->l0.me8x8[i];
1444 m->i_pixel = PIXEL_8x8;
1445 m->i_ref_cost = i_ref_cost;
1447 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1448 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1449 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1451 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1452 x264_me_search( h, m, mvc, i_mvc );
1454 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1456 CP32( mvc[i_mvc], m->mv );
1459 a->i_satd8x8[0][i] = m->cost - m->cost_mv;
1462 m->cost += i_ref_cost;
1463 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1464 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1467 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1468 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1469 /* theoretically this should include 4*ref_cost,
1470 * but 3 seems a better approximation of cabac. */
1471 if( h->param.b_cabac )
1472 a->l0.i_cost8x8 -= i_ref_cost;
1473 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1474 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1477 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1480 pixel **p_fenc = h->mb.pic.p_fenc;
1481 ALIGNED_4( int16_t mvc[3][2] );
1483 /* XXX Needed for x264_mb_predict_mv */
1484 h->mb.i_partition = D_16x8;
1486 for( int i = 0; i < 2; i++ )
1488 x264_me_t *l0m = &a->l0.me16x8[i];
1489 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1490 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1491 const int ref8[2] = { minref, maxref };
1492 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1494 m.i_pixel = PIXEL_16x8;
1496 LOAD_FENC( &m, p_fenc, 0, 8*i );
1497 l0m->cost = INT_MAX;
1498 for( int j = 0; j < i_ref8s; j++ )
1500 const int i_ref = ref8[j];
1501 m.i_ref_cost = REF_COST( 0, i_ref );
1503 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1504 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1505 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1506 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1508 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1509 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1511 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1512 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1513 /* We can only take this shortcut if the first search was performed on ref0. */
1514 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1516 /* We can just leave the MV from the previous ref search. */
1517 x264_me_refine_qpel_refdupe( h, &m, NULL );
1520 x264_me_search( h, &m, mvc, 3 );
1522 m.cost += m.i_ref_cost;
1524 if( m.cost < l0m->cost )
1525 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1528 /* Early termination based on the current SATD score of partition[0]
1529 plus the estimated SATD score of partition[1] */
1530 if( !i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1532 a->l0.i_cost16x8 = COST_MAX;
1536 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1537 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1540 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1543 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
1546 pixel **p_fenc = h->mb.pic.p_fenc;
1547 ALIGNED_4( int16_t mvc[3][2] );
1549 /* XXX Needed for x264_mb_predict_mv */
1550 h->mb.i_partition = D_8x16;
1552 for( int i = 0; i < 2; i++ )
1554 x264_me_t *l0m = &a->l0.me8x16[i];
1555 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1556 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1557 const int ref8[2] = { minref, maxref };
1558 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1560 m.i_pixel = PIXEL_8x16;
1562 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1563 l0m->cost = INT_MAX;
1564 for( int j = 0; j < i_ref8s; j++ )
1566 const int i_ref = ref8[j];
1567 m.i_ref_cost = REF_COST( 0, i_ref );
1569 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1570 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1571 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1573 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1574 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1576 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1577 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1578 /* We can only take this shortcut if the first search was performed on ref0. */
1579 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1581 /* We can just leave the MV from the previous ref search. */
1582 x264_me_refine_qpel_refdupe( h, &m, NULL );
1585 x264_me_search( h, &m, mvc, 3 );
1587 m.cost += m.i_ref_cost;
1589 if( m.cost < l0m->cost )
1590 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1593 /* Early termination based on the current SATD score of partition[0]
1594 plus the estimated SATD score of partition[1] */
1595 if( !i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4 )
1597 a->l0.i_cost8x16 = COST_MAX;
1601 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1602 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1605 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1608 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
1610 ALIGNED_ARRAY_16( pixel, pix1,[16*8] );
1611 pixel *pix2 = pix1+8;
1612 const int i_stride = h->mb.pic.i_stride[1];
1613 const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride;
1614 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1615 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1616 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1617 x264_weight_t *weight = h->sh.weight[i_ref];
1619 // FIXME weight can be done on 4x4 blocks even if mc is smaller
1620 #define CHROMA4x4MC( width, height, me, x, y ) \
1621 h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1622 if( weight[1].weightfn ) \
1623 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1624 if( weight[2].weightfn ) \
1625 weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1628 if( size == PIXEL_4x4 )
1630 x264_me_t *m = a->l0.me4x4[i8x8];
1631 CHROMA4x4MC( 2,2, m[0], 0,0 );
1632 CHROMA4x4MC( 2,2, m[1], 2,0 );
1633 CHROMA4x4MC( 2,2, m[2], 0,2 );
1634 CHROMA4x4MC( 2,2, m[3], 2,2 );
1636 else if( size == PIXEL_8x4 )
1638 x264_me_t *m = a->l0.me8x4[i8x8];
1639 CHROMA4x4MC( 4,2, m[0], 0,0 );
1640 CHROMA4x4MC( 4,2, m[1], 0,2 );
1644 x264_me_t *m = a->l0.me4x8[i8x8];
1645 CHROMA4x4MC( 2,4, m[0], 0,0 );
1646 CHROMA4x4MC( 2,4, m[1], 2,0 );
1649 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1650 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1653 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1655 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1656 pixel **p_fenc = h->mb.pic.p_fenc;
1657 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1659 /* XXX Needed for x264_mb_predict_mv */
1660 h->mb.i_partition = D_8x8;
1662 for( int i4x4 = 0; i4x4 < 4; i4x4++ )
1664 const int idx = 4*i8x8 + i4x4;
1665 const int x4 = block_idx_x[idx];
1666 const int y4 = block_idx_y[idx];
1667 const int i_mvc = (i4x4 == 0);
1669 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1671 m->i_pixel = PIXEL_4x4;
1673 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1674 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1675 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1677 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1678 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1680 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1682 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1683 a->l0.me4x4[i8x8][1].cost +
1684 a->l0.me4x4[i8x8][2].cost +
1685 a->l0.me4x4[i8x8][3].cost +
1686 REF_COST( 0, i_ref ) +
1687 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1688 if( h->mb.b_chroma_me )
1689 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1692 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1694 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1695 pixel **p_fenc = h->mb.pic.p_fenc;
1696 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1698 /* XXX Needed for x264_mb_predict_mv */
1699 h->mb.i_partition = D_8x8;
1701 for( int i8x4 = 0; i8x4 < 2; i8x4++ )
1703 const int idx = 4*i8x8 + 2*i8x4;
1704 const int x4 = block_idx_x[idx];
1705 const int y4 = block_idx_y[idx];
1706 const int i_mvc = (i8x4 == 0);
1708 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1710 m->i_pixel = PIXEL_8x4;
1712 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1713 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1714 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1716 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1717 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1719 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1721 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1722 REF_COST( 0, i_ref ) +
1723 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1724 if( h->mb.b_chroma_me )
1725 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1728 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1730 pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1731 pixel **p_fenc = h->mb.pic.p_fenc;
1732 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1734 /* XXX Needed for x264_mb_predict_mv */
1735 h->mb.i_partition = D_8x8;
1737 for( int i4x8 = 0; i4x8 < 2; i4x8++ )
1739 const int idx = 4*i8x8 + i4x8;
1740 const int x4 = block_idx_x[idx];
1741 const int y4 = block_idx_y[idx];
1742 const int i_mvc = (i4x8 == 0);
1744 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1746 m->i_pixel = PIXEL_4x8;
1748 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1749 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1750 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1752 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1753 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1755 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1757 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1758 REF_COST( 0, i_ref ) +
1759 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1760 if( h->mb.b_chroma_me )
1761 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1764 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
1766 ALIGNED_ARRAY_16( pixel, pix, [4],[8*8] );
1767 ALIGNED_ARRAY_16( pixel, bi, [2],[8*8] );
1768 int l0_mvy_offset, l1_mvy_offset;
1769 int i_chroma_cost = 0;
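/* COST_BI_CHROMA below averages the two lists' chroma predictions with the same bipred
 * weights as luma; i_pixel+3 converts the luma partition size to its 4:2:0 chroma size
 * (16x16->8x8, 16x8->8x4, 8x16->4x8, 8x8->4x4). */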
1771 #define COST_BI_CHROMA( m0, m1, width, height ) \
1773 l0_mvy_offset = h->mb.b_interlaced & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1774 l1_mvy_offset = h->mb.b_interlaced & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
1775 h->mc.mc_chroma( pix[0], pix[1], 8, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \
1776 h->mc.mc_chroma( pix[2], pix[3], 8, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \
1777 h->mc.avg[i_pixel+3]( bi[0], 8, pix[0], 8, pix[2], 8, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1778 h->mc.avg[i_pixel+3]( bi[1], 8, pix[1], 8, pix[3], 8, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
1779 i_chroma_cost = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 8 ); \
1780 i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 8 ); \
1783 if( i_pixel == PIXEL_16x16 )
1784 COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 8, 8 )
1785 else if( i_pixel == PIXEL_16x8 )
1786 COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 8, 4 )
1787 else if( i_pixel == PIXEL_8x16 )
1788 COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 4, 8 )
1790 COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 4, 4 )
1792 return i_chroma_cost;
1795 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1797 /* Assumes that fdec still contains the results of
1798 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1800 pixel *p_fenc = h->mb.pic.p_fenc[0];
1801 pixel *p_fdec = h->mb.pic.p_fdec[0];
1803 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1804 if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
1805 for( int i = 0; i < 4; i++ )
1807 const int x = (i&1)*8;
1808 const int y = (i>>1)*8;
1809 a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
1810 &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
1811 if( h->mb.b_chroma_me )
1813 a->i_cost8x8direct[i] += h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][(x>>1)+(y>>1)*FENC_STRIDE], FENC_STRIDE,
1814 &h->mb.pic.p_fdec[1][(x>>1)+(y>>1)*FDEC_STRIDE], FDEC_STRIDE )
1815 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][(x>>1)+(y>>1)*FENC_STRIDE], FENC_STRIDE,
1816 &h->mb.pic.p_fdec[2][(x>>1)+(y>>1)*FDEC_STRIDE], FDEC_STRIDE );
1818 a->i_cost16x16direct += a->i_cost8x8direct[i];
1821 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1825 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
1826 if( h->mb.b_chroma_me )
1828 a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
1829 + h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
1834 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1836 ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
1837 ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
1839 int stride0 = 16, stride1 = 16;
1841 ALIGNED_4( int16_t mvc[9][2] );
1842 int try_skip = a->b_try_skip;
1843 int list1_skipped = 0;
1844 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
1845 int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
1846 h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};
1849 m.i_pixel = PIXEL_16x16;
1851 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1853 /* 16x16 Search on list 0 and list 1 */
1854 a->l0.me16x16.cost = INT_MAX;
1855 a->l1.me16x16.cost = INT_MAX;
1856 for( int l = 1; l >= 0; )
1858 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1860 /* This loop is extremely munged in order to facilitate the following order of operations,
1861 * necessary for an efficient fast skip.
1862 * 1. Search list1 ref0.
1863 * 2. Search list0 ref0.
1864 * 3. Try skip.
1865 * 4. Search the rest of list0.
1866 * 5. Go back and finish list1.
1868 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1870 if( try_skip && l == 1 && i_ref > 0 )
1876 m.i_ref_cost = REF_COST( l, i_ref );
1878 /* search with ref */
1879 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1880 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1881 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1882 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
1885 m.cost += m.i_ref_cost;
1887 if( m.cost < lX->me16x16.cost )
1888 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1890 /* save mv for predicting neighbors */
1891 CP32( lX->mvc[i_ref][0], m.mv );
1892 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1894 /* Fast skip detection. */
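/* The earlier bskip probe stays valid only while each list's best ref0 MV lies within one
 * quarter-pel (L1 distance) of the corresponding direct MV; once both lists pass the
 * check, the macroblock is committed as B_SKIP. */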
1895 if( i_ref == 0 && try_skip )
1897 if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
1898 abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
1904 /* We already tested skip */
1905 h->mb.i_type = B_SKIP;
1906 x264_analyse_update_cache( h, a );
1911 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
1913 if( list1_skipped && l == 0 )
1919 /* get cost of BI mode */
1920 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1921 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1922 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1923 src0 = h->mc.get_ref( pix0, &stride0,
1924 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1925 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1926 src1 = h->mc.get_ref( pix1, &stride1,
1927 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1928 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1930 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1932 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1934 + a->l0.bi16x16.cost_mv
1935 + a->l1.bi16x16.cost_mv;
1937 if( h->mb.b_chroma_me )
1938 a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
1940 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
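/* Coding the zero vector costs the MVD (0,0) - mvp in each list, hence the
 * p_cost_mv[-mvp[]] lookups. */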
1941 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1943 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1944 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1945 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1946 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1947 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1948 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1949 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1950 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1951 + ref_costs + l0_mv_cost + l1_mv_cost;
1953 if( h->mb.b_chroma_me )
1955 ALIGNED_ARRAY_16( pixel, pixuv, [2],[8*FENC_STRIDE] );
1956 ALIGNED_ARRAY_16( pixel, bi, [8*FENC_STRIDE] );
1958 if( h->mb.b_interlaced & a->l0.bi16x16.i_ref )
1960 int l0_mvy_offset = h->mb.b_interlaced & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1961 h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
1962 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
1965 h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
1967 if( h->mb.b_interlaced & a->l1.bi16x16.i_ref )
1969 int l1_mvy_offset = h->mb.b_interlaced & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1970 h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
1971 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
1974 h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
1976 h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
1977 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1978 h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
1979 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1981 cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
1982 + h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
1985 if( cost00 < a->i_cost16x16bi )
1987 M32( a->l0.bi16x16.mv ) = 0;
1988 M32( a->l1.bi16x16.mv ) = 0;
1989 a->l0.bi16x16.cost_mv = l0_mv_cost;
1990 a->l1.bi16x16.cost_mv = l1_mv_cost;
1991 a->i_cost16x16bi = cost00;
1996 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1997 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1998 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
2001 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
2006 switch( h->mb.i_sub_partition[i] )
2009 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
2012 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
2013 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
2016 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
2017 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
2020 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
2021 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
2022 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
2023 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
2026 x264_log( h, X264_LOG_ERROR, "internal error\n" );
2031 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
2035 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
2036 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
2037 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
2038 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
2041 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
2042 if( x264_mb_partition_listX_table[0][part] ) \
2044 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
2045 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
2049 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
2050 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
2052 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
2054 if( x264_mb_partition_listX_table[1][part] ) \
2056 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
2057 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
2061 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
2062 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
2064 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
2067 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2071 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
2073 x264_mb_load_mv_direct8x8( h, i );
2076 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
2077 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
2078 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
2083 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
2086 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2088 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
2090 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
2092 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
2096 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
2098 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2099 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
2101 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
2102 * than those used by the neighbors */
2103 #define CHECK_NEIGHBOUR(i)\
2105 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
2106 if( ref > i_maxref[l] )\
2110 for( int l = 0; l < 2; l++ )
2112 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2113 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
2114 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
2117 CHECK_NEIGHBOUR( -8 - 1 );
2118 CHECK_NEIGHBOUR( -8 + 0 );
2119 CHECK_NEIGHBOUR( -8 + 2 );
2120 CHECK_NEIGHBOUR( -8 + 4 );
2121 CHECK_NEIGHBOUR( 0 - 1 );
2122 CHECK_NEIGHBOUR( 2*8 - 1 );
2126 /* XXX Needed for x264_mb_predict_mv */
2127 h->mb.i_partition = D_8x8;
2131 for( int i = 0; i < 4; i++ )
2137 int stride[2] = {8,8};
2140 m.i_pixel = PIXEL_8x8;
2141 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2143 for( int l = 0; l < 2; l++ )
2145 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2147 lX->me8x8[i].cost = INT_MAX;
2148 for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
2150 m.i_ref_cost = REF_COST( l, i_ref );
2152 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
2154 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
2155 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2156 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
2157 m.cost += m.i_ref_cost;
2159 if( m.cost < lX->me8x8[i].cost )
2161 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
2162 a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
2165 /* save mv for predicting other partitions within this MB */
2166 CP32( lX->mvc[i_ref][i+1], m.mv );
2171 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
2172 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
2173 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
2174 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
2175 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
2176 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
2178 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2179 i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
2180 + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
2181 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2183 if( h->mb.b_chroma_me )
2185 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2186 i_part_cost_bi += i_chroma_cost;
2187 a->i_satd8x8[2][i] += i_chroma_cost;
2190 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2191 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2193 i_part_cost = a->l0.me8x8[i].cost;
2194 h->mb.i_sub_partition[i] = D_L0_8x8;
2195 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2196 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2197 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2198 a->i_cost8x8bi += i_part_cost;
2200 /* XXX Needed for x264_mb_predict_mv */
2201 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2205 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2208 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
2211 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
2212 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
2213 ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
2215 /* XXX Needed for x264_mb_predict_mv */
2216 h->mb.i_partition = D_8x8;
2220 for( int i = 0; i < 4; i++ )
2225 int i_part_cost_bi = 0;
2226 int stride[2] = {8,8};
2229 for( int l = 0; l < 2; l++ )
2231 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2232 x264_me_t *m = &lX->me8x8[i];
2233 m->i_pixel = PIXEL_8x8;
2234 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
2236 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
2237 m->i_ref = lX->me16x16.i_ref;
2239 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
2241 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
2242 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2243 x264_me_search( h, m, &lX->me16x16.mv, 1 );
2244 a->i_satd8x8[l][i] = m->cost - m->cost_mv;
2245 m->cost += m->i_ref_cost;
2247 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
2249 /* save mv for predicting other partitions within this MB */
2250 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
2253 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2254 m->mv[0], m->mv[1], 8, 8, weight_none );
2255 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
2257 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
2258 a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2259 i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
2260 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2261 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2263 if( h->mb.b_chroma_me )
2265 int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
2266 i_part_cost_bi += i_chroma_cost;
2267 a->i_satd8x8[2][i] += i_chroma_cost;
2270 i_part_cost = a->l0.me8x8[i].cost;
2271 h->mb.i_sub_partition[i] = D_L0_8x8;
2272 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2273 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2274 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2275 a->i_cost8x8bi += i_part_cost;
2277 /* XXX Needed for x264_mb_predict_mv */
2278 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2282 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
2285 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2287 ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] );
2288 ALIGNED_4( int16_t mvc[3][2] );
2290 h->mb.i_partition = D_16x8;
2291 a->i_cost16x8bi = 0;
2293 for( int i = 0; i < 2; i++ )
2296 int i_part_cost_bi = 0;
2297 int stride[2] = {16,16};
2300 m.i_pixel = PIXEL_16x8;
2301 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2303 for( int l = 0; l < 2; l++ )
2305 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2306 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2307 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2308 lX->me16x8[i].cost = INT_MAX;
2309 for( int j = 0; j < i_ref8s; j++ )
2311 int i_ref = ref8[j];
2312 m.i_ref_cost = REF_COST( l, i_ref );
2314 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2316 CP32( mvc[0], lX->mvc[i_ref][0] );
2317 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2318 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2320 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2321 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2322 x264_me_search( h, &m, mvc, 3 );
2323 m.cost += m.i_ref_cost;
2325 if( m.cost < lX->me16x8[i].cost )
2326 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
2331 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2332 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2333 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2334 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2335 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2336 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2338 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2339 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2340 + a->l1.me16x8[i].i_ref_cost;
2342 if( h->mb.b_chroma_me )
2343 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
2345 i_part_cost = a->l0.me16x8[i].cost;
2346 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2348 if( a->l1.me16x8[i].cost < i_part_cost )
2350 i_part_cost = a->l1.me16x8[i].cost;
2351 a->i_mb_partition16x8[i] = D_L1_8x8;
2353 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2355 i_part_cost = i_part_cost_bi;
2356 a->i_mb_partition16x8[i] = D_BI_8x8;
2358 a->i_cost16x8bi += i_part_cost;
2360 /* Early termination based on the current SATD score of partition[0]
2361 plus the estimated SATD score of partition[1] */
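/* The slack factor is (16 + !!i_mbrd + !!i_psy_rd)/16, e.g. 18/16 (12.5%) with both
 * enabled and exactly 16/16 with neither. */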
2362 if( !i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
2363 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2365 a->i_cost16x8bi = COST_MAX;
2369 x264_mb_cache_mv_b16x8( h, a, i, 0 );
2373 a->i_mb_type16x8 = B_L0_L0
2374 + (a->i_mb_partition16x8[0]>>2) * 3
2375 + (a->i_mb_partition16x8[1]>>2);
2376 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2379 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
2381 ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
2382 ALIGNED_4( int16_t mvc[3][2] );
2384 h->mb.i_partition = D_8x16;
2385 a->i_cost8x16bi = 0;
2387 for( int i = 0; i < 2; i++ )
2390 int i_part_cost_bi = 0;
2391 int stride[2] = {8,8};
2394 m.i_pixel = PIXEL_8x16;
2395 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2397 for( int l = 0; l < 2; l++ )
2399 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2400 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2401 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2402 lX->me8x16[i].cost = INT_MAX;
2403 for( int j = 0; j < i_ref8s; j++ )
2405 int i_ref = ref8[j];
2406 m.i_ref_cost = REF_COST( l, i_ref );
2408 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2410 CP32( mvc[0], lX->mvc[i_ref][0] );
2411 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2412 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2414 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2415 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2416 x264_me_search( h, &m, mvc, 3 );
2417 m.cost += m.i_ref_cost;
2419 if( m.cost < lX->me8x16[i].cost )
2420 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
2425 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2426 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2427 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2428 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2429 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2431 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2432 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2433 + a->l1.me8x16[i].i_ref_cost;
2435 if( h->mb.b_chroma_me )
2436 i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
2438 i_part_cost = a->l0.me8x16[i].cost;
2439 a->i_mb_partition8x16[i] = D_L0_8x8;
2441 if( a->l1.me8x16[i].cost < i_part_cost )
2443 i_part_cost = a->l1.me8x16[i].cost;
2444 a->i_mb_partition8x16[i] = D_L1_8x8;
2446 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2448 i_part_cost = i_part_cost_bi;
2449 a->i_mb_partition8x16[i] = D_BI_8x8;
2451 a->i_cost8x16bi += i_part_cost;
2453 /* Early termination based on the current SATD score of partition[0]
2454 plus the estimated SATD score of partition[1] */
2455 if( !i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
2456 * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16 )
2458 a->i_cost8x16bi = COST_MAX;
2462 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2466 a->i_mb_type8x16 = B_L0_L0
2467 + (a->i_mb_partition8x16[0]>>2) * 3
2468 + (a->i_mb_partition8x16[1]>>2);
2469 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2472 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2474 int thresh = i_satd * 5/4;
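/* Partitions get a full RD pass only if their SATD cost is within 25% of the best SATD
 * score (50% for the 16x16 check just below). */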
2476 h->mb.i_type = P_L0;
2477 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2479 h->mb.i_partition = D_16x16;
2480 x264_analyse_update_cache( h, a );
2481 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2484 if( a->l0.i_cost16x8 <= thresh )
2486 h->mb.i_partition = D_16x8;
2487 x264_analyse_update_cache( h, a );
2488 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2491 a->l0.i_cost16x8 = COST_MAX;
2493 if( a->l0.i_cost8x16 <= thresh )
2495 h->mb.i_partition = D_8x16;
2496 x264_analyse_update_cache( h, a );
2497 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2500 a->l0.i_cost8x16 = COST_MAX;
2502 if( a->l0.i_cost8x8 <= thresh )
2504 h->mb.i_type = P_8x8;
2505 h->mb.i_partition = D_8x8;
2506 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2508 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2509 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2510 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2511 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2512 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2513 * for future blocks are those left over from previous RDO calls. */
2514 for( int i = 0; i < 4; i++ )
2516 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2517 int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2518 int subtype, btype = D_L0_8x8;
2519 uint64_t bcost = COST_MAX64;
2520 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2523 if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2525 h->mb.i_sub_partition[i] = subtype;
2526 x264_mb_cache_mv_p8x8( h, a, i );
2527 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2528 COPY2_IF_LT( bcost, cost, btype, subtype );
2530 if( h->mb.i_sub_partition[i] != btype )
2532 h->mb.i_sub_partition[i] = btype;
2533 x264_mb_cache_mv_p8x8( h, a, i );
2538 x264_analyse_update_cache( h, a );
2539 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2542 a->l0.i_cost8x8 = COST_MAX;
2545 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2547 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
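/* i.e. candidate B modes are re-scored with full RD only if their SATD cost is at most
 * 6.25% above the best inter SATD (12.5% with psy-RD). */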
2549 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2551 h->mb.i_type = B_DIRECT;
2552 /* Assumes direct/skip MC is still in fdec */
2553 /* Requires b-rdo to be done before intra analysis */
2554 h->mb.b_skip_mc = 1;
2555 x264_analyse_update_cache( h, a );
2556 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2557 h->mb.b_skip_mc = 0;
2560 //FIXME not all the update_cache calls are needed
2561 h->mb.i_partition = D_16x16;
2563 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2565 h->mb.i_type = B_L0_L0;
2566 x264_analyse_update_cache( h, a );
2567 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2571 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2573 h->mb.i_type = B_L1_L1;
2574 x264_analyse_update_cache( h, a );
2575 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2579 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2581 h->mb.i_type = B_BI_BI;
2582 x264_analyse_update_cache( h, a );
2583 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2587 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2589 h->mb.i_type = B_8x8;
2590 h->mb.i_partition = D_8x8;
2591 x264_analyse_update_cache( h, a );
2592 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2593 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2597 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2599 h->mb.i_type = a->i_mb_type16x8;
2600 h->mb.i_partition = D_16x8;
2601 x264_analyse_update_cache( h, a );
2602 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2606 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2608 h->mb.i_type = a->i_mb_type8x16;
2609 h->mb.i_partition = D_8x16;
2610 x264_analyse_update_cache( h, a );
2611 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2615 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2619 if( IS_INTRA(h->mb.i_type) )
2622 switch( h->mb.i_partition )
2625 if( h->mb.i_type == B_BI_BI )
2627 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2628 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2632 for( int i = 0; i < 2; i++ )
2633 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2635 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2636 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2640 for( int i = 0; i < 2; i++ )
2641 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2643 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2644 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2648 for( int i = 0; i < 4; i++ )
2649 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2651 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2652 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2658 static inline void x264_mb_analyse_transform( x264_t *h )
2660 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2662 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2665 int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2666 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2667 int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2668 h->mb.pic.p_fdec[0], FDEC_STRIDE );
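/* sa8d is the Hadamard SATD over 8x8 blocks and satd over 4x4 blocks, so each score
 * approximates the residual cost under its matching transform size; the cheaper one
 * decides b_transform_8x8. */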
2670 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2671 h->mb.b_skip_mc = 1;
2675 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2677 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2679 x264_analyse_update_cache( h, a );
2680 h->mb.b_transform_8x8 ^= 1;
2681 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2682 int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
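/* If the flipped transform size is no worse in RD, keep it and rescale the SATD-domain
 * score by i_rd8 / (*i_rd) so the caller's SATD-based thresholds stay on a consistent
 * scale. */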
2684 if( *i_rd >= i_rd8 )
2687 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2691 h->mb.b_transform_8x8 ^= 1;
2695 /* Rate-distortion optimal QP selection.
2696 * FIXME: More than half of the benefit of this function seems to be
2697 * in the way it improves the coding of chroma DC (by decimating or
2698 * finding a better way to code a single DC coefficient).
2699 * There must be a more efficient way to get that portion of the benefit
2700 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2701 * trick. */
2702 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2704 int bcost, cost, failures, prevcost, origcost;
2705 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2706 int last_qp_tried = 0;
2707 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2708 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2710 /* If CBP is already zero, don't raise the quantizer any higher. */
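/* Ignoring the empty-CBP shortcut and the last-QP bookkeeping handled in the loop, each
 * direction amounts to the walk sketched here (illustrative only, never compiled;
 * rd_cost_at() stands in for x264_rd_cost_mb() at a given QP, qp_min/qp_max for the
 * ratecontrol limits): */
#if 0
int qp = orig_qp + direction;
int failures = 0;
prevcost = origcost;
while( qp >= qp_min && qp <= qp_max )
{
int cur = rd_cost_at( qp );
COPY2_IF_LT( bcost, cur, bqp, qp );
failures = cur < prevcost ? 0 : failures + 1; /* a tie counts as a failure */
prevcost = cur;
if( failures > threshold )
break;
qp += direction;
}
#endif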
2711 for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2713 /* Without psy-RD, require monotonicity when moving quant away from previous
2714 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2715 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2716 * allow 2 failures when moving quant towards previous quant.
2717 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2718 int threshold = (!!h->mb.i_psy_rd);
2719 /* Raise the threshold for failures if we're moving towards the last QP. */
2720 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2721 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2723 h->mb.i_qp = orig_qp;
2725 prevcost = origcost;
2727 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2728 * (up to a point) will too. So, jump down to where the threshold will kick in
2729 * and check the QP there. If the CBP is still empty, skip the main loop.
2730 * If it isn't empty, we would have ended up having to check this QP anyways,
2731 * so as long as we store it for later lookup, we lose nothing. */
2732 int already_checked_qp = -1;
2733 int already_checked_cost = COST_MAX;
2734 if( direction == -1 )
2738 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2739 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2740 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2741 if( !h->mb.cbp[h->mb.i_mb_xy] )
2743 /* If our empty-CBP block is lower QP than the last QP,
2744 * the last QP almost surely doesn't have a CBP either. */
2745 if( h->mb.i_last_qp > h->mb.i_qp )
2749 already_checked_qp = h->mb.i_qp;
2750 h->mb.i_qp = orig_qp;
2754 h->mb.i_qp += direction;
2755 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
2757 if( h->mb.i_last_qp == h->mb.i_qp )
2759 if( h->mb.i_qp == already_checked_qp )
2760 cost = already_checked_cost;
2763 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2764 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2765 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2768 /* We can't assume that the costs are monotonic over QPs.
2769 * Treating a tie as a failure seems to give better results. */
2770 if( cost < prevcost )
2776 if( failures > threshold )
2778 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2780 h->mb.i_qp += direction;
2784 /* Always try the last block's QP. */
2785 if( !last_qp_tried )
2787 h->mb.i_qp = h->mb.i_last_qp;
2788 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2789 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2790 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2794 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2796 /* Check transform again; decision from before may no longer be optimal. */
2797 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2798 x264_mb_transform_8x8_allowed( h ) )
2800 h->mb.b_transform_8x8 ^= 1;
2801 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2803 h->mb.b_transform_8x8 ^= 1;
2807 /*****************************************************************************
2808 * x264_macroblock_analyse:
2809 *****************************************************************************/
2810 void x264_macroblock_analyse( x264_t *h )
2812 x264_mb_analysis_t analysis;
2813 int i_cost = COST_MAX;
2815 h->mb.i_qp = x264_ratecontrol_mb_qp( h );
2816 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2817 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2818 if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2819 h->mb.i_qp = h->mb.i_last_qp;
2821 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2823 /*--------------------------- Do the analysis ---------------------------*/
2824 if( h->sh.i_type == SLICE_TYPE_I )
2827 if( analysis.i_mbrd )
2828 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2829 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2830 if( analysis.i_mbrd )
2831 x264_intra_rd( h, &analysis, COST_MAX );
2833 i_cost = analysis.i_satd_i16x16;
2834 h->mb.i_type = I_16x16;
2835 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2836 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2837 if( analysis.i_satd_pcm < i_cost )
2838 h->mb.i_type = I_PCM;
2840 else if( analysis.i_mbrd >= 2 )
2841 x264_intra_rd_refine( h, &analysis );
2843 else if( h->sh.i_type == SLICE_TYPE_P )
2847 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2849 analysis.b_try_skip = 0;
2850 if( analysis.b_force_intra )
2852 if( !h->param.analyse.b_psy )
2854 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2855 goto intra_analysis;
2860 /* Fast P_SKIP detection */
2861 if( h->param.analyse.b_fast_pskip )
2863 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2864 // FIXME don't need to check this if the reference frame is done
2866 else if( h->param.analyse.i_subpel_refine >= 3 )
2867 analysis.b_try_skip = 1;
2868 else if( h->mb.i_mb_type_left[0] == P_SKIP ||
2869 h->mb.i_mb_type_top == P_SKIP ||
2870 h->mb.i_mb_type_topleft == P_SKIP ||
2871 h->mb.i_mb_type_topright == P_SKIP )
2872 b_skip = x264_macroblock_probe_pskip( h );
2876 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2880 h->mb.i_type = P_SKIP;
2881 h->mb.i_partition = D_16x16;
2882 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2883 /* Set up MVs for future predictors */
2884 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
2885 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2889 const unsigned int flags = h->param.analyse.inter;
2893 int i_satd_inter, i_satd_intra;
2895 x264_mb_analyse_load_costs( h, &analysis );
2897 x264_mb_analyse_inter_p16x16( h, &analysis );
2899 if( h->mb.i_type == P_SKIP )
2901 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
2902 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2906 if( flags & X264_ANALYSE_PSUB16x16 )
2908 if( h->param.analyse.b_mixed_references )
2909 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2911 x264_mb_analyse_inter_p8x8( h, &analysis );
2914 /* Select best inter mode */
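/* COPY2_IF_LT( cost, c, type, t ) keeps a running minimum: if c < cost it takes cost = c
 * and type = t; COPY3_IF_LT additionally updates the partition. */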
2916 i_partition = D_16x16;
2917 i_cost = analysis.l0.me16x16.cost;
2919 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2920 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2923 i_partition = D_8x8;
2924 i_cost = analysis.l0.i_cost8x8;
2927 if( flags & X264_ANALYSE_PSUB8x8 )
2929 for( int i = 0; i < 4; i++ )
2931 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2932 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2934 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2935 h->mb.i_sub_partition[i] = D_L0_4x4;
2937 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2938 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2939 h->mb.i_sub_partition[i], D_L0_8x4 );
2941 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2942 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2943 h->mb.i_sub_partition[i], D_L0_4x8 );
2945 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2947 x264_mb_cache_mv_p8x8( h, &analysis, i );
2949 analysis.l0.i_cost8x8 = i_cost;
2953 /* Now do 16x8/8x16 */
2954 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2955 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2956 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2958 int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
2959 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2960 analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2962 x264_mb_analyse_inter_p16x8( h, &analysis, i_cost );
2963 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2965 i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
2966 + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
2967 analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;
2969 x264_mb_analyse_inter_p8x16( h, &analysis, i_cost );
2970 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2973 h->mb.i_partition = i_partition;
2976 //FIXME mb_type costs?
2977 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2981 else if( i_partition == D_16x16 )
2983 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2984 i_cost = analysis.l0.me16x16.cost;
2986 else if( i_partition == D_16x8 )
2988 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2989 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2990 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2992 else if( i_partition == D_8x16 )
2994 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2995 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2996 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2998 else if( i_partition == D_8x8 )
3001 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3003 switch( h->mb.i_sub_partition[i8x8] )
3006 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
3007 i_cost += analysis.l0.me8x8[i8x8].cost;
3010 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
3011 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
3012 i_cost += analysis.l0.me8x4[i8x8][0].cost +
3013 analysis.l0.me8x4[i8x8][1].cost;
3016 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
3017 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
3018 i_cost += analysis.l0.me4x8[i8x8][0].cost +
3019 analysis.l0.me4x8[i8x8][1].cost;
3023 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
3024 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
3025 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
3026 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
3027 i_cost += analysis.l0.me4x4[i8x8][0].cost +
3028 analysis.l0.me4x4[i8x8][1].cost +
3029 analysis.l0.me4x4[i8x8][2].cost +
3030 analysis.l0.me4x4[i8x8][3].cost;
3033 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
3039 if( h->mb.b_chroma_me )
3041 x264_mb_analyse_intra_chroma( h, &analysis );
3042 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
3043 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
3044 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
3045 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
3048 x264_mb_analyse_intra( h, &analysis, i_cost );
3050 i_satd_inter = i_cost;
3051 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
3052 analysis.i_satd_i8x8,
3053 analysis.i_satd_i4x4 );
3055 if( analysis.i_mbrd )
3057 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
3059 i_partition = D_16x16;
3060 i_cost = analysis.l0.i_rd16x16;
3061 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
3062 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
3063 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
3064 h->mb.i_type = i_type;
3065 h->mb.i_partition = i_partition;
3066 if( i_cost < COST_MAX )
3067 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3068 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
3071 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3072 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3073 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3074 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3076 h->mb.i_type = i_type;
3078 if( analysis.b_force_intra && !IS_INTRA(i_type) )
3080 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
3081 * it were an inter block. */
3082 x264_analyse_update_cache( h, &analysis );
3083 x264_macroblock_encode( h );
3084 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
3085 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
3086 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
3087 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
3088 goto intra_analysis;
3091 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
3093 if( IS_INTRA( h->mb.i_type ) )
3095 x264_intra_rd_refine( h, &analysis );
3097 else if( i_partition == D_16x16 )
3099 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
3100 analysis.l0.me16x16.cost = i_cost;
3101 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3103 else if( i_partition == D_16x8 )
3105 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
3106 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
3107 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
3108 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
3109 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
3110 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
3112 else if( i_partition == D_8x16 )
3114 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
3115 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
3116 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
3117 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
3118 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
3119 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
3121 else if( i_partition == D_8x8 )
3123 x264_analyse_update_cache( h, &analysis );
3124 for( int i8x8 = 0; i8x8 < 4; i8x8++ )
3126 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
3128 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
3130 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
3132 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3133 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
3135 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
3137 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3138 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
3140 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
3142 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
3143 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
3144 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
3145 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
3152 else if( h->sh.i_type == SLICE_TYPE_B )
3154 int i_bskip_cost = COST_MAX;
3157 if( analysis.i_mbrd )
3158 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
3160 h->mb.i_type = B_SKIP;
3161 if( h->mb.b_direct_auto_write )
3163 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
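/* Both spatial and temporal direct are probed; i_direct_score tallies, per prediction
 * mode, how often the resulting direct MVs make this macroblock skippable, and the
 * per-frame totals are what later drive the spatial/temporal choice. */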
3164 for( int i = 0; i < 2; i++ )
3167 h->sh.b_direct_spatial_mv_pred ^= 1;
3168 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
3169 if( analysis.b_direct_available )
3174 b_skip = x264_macroblock_probe_bskip( h );
3176 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
3183 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
3185 analysis.b_try_skip = 0;
3186 if( analysis.b_direct_available )
3188 if( !h->mb.b_direct_auto_write )
3190 if( analysis.i_mbrd )
3192 i_bskip_cost = ssd_mb( h );
3193 /* 6 = minimum cavlc cost of a non-skipped MB */
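/* x264_rd_cost_mb() scores in SSD units with the rate term scaled by i_lambda2 and
 * rounded down by 8 bits, so (6 * i_lambda2 + 128) >> 8 is that minimum rate expressed
 * in the same distortion units. */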
3194 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
3196 else if( !h->mb.b_direct_auto_write )
3198 /* Conditioning the probe on neighboring block types
3199 * doesn't seem to help speed or quality. */
3200 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
3201 if( h->param.analyse.i_subpel_refine < 3 )
3202 b_skip = analysis.b_try_skip;
3204 /* Set up MVs for future predictors */
3207 for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
3208 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3209 for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
3210 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
3216 const unsigned int flags = h->param.analyse.inter;
3220 h->mb.b_skip_mc = 0;
3221 h->mb.i_type = B_DIRECT;
3223 x264_mb_analyse_load_costs( h, &analysis );
3225 /* select best inter mode */
3226 /* direct must be first */
3227 if( analysis.b_direct_available )
3228 x264_mb_analyse_inter_direct( h, &analysis );
3230 x264_mb_analyse_inter_b16x16( h, &analysis );
3232 if( h->mb.i_type == B_SKIP )
3234 for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
3235 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
3236 for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
3237 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
3242 i_partition = D_16x16;
3243 i_cost = analysis.l0.me16x16.cost;
3244 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
3245 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
3246 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
3248 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
3250 x264_mb_analyse_b_rd( h, &analysis, i_cost );
3251 if( i_bskip_cost < analysis.i_rd16x16direct &&
3252 i_bskip_cost < analysis.i_rd16x16bi &&
3253 i_bskip_cost < analysis.l0.i_rd16x16 &&
3254 i_bskip_cost < analysis.l1.i_rd16x16 )
3256 h->mb.i_type = B_SKIP;
3257 x264_analyse_update_cache( h, &analysis );
3262 if( flags & X264_ANALYSE_BSUB16x16 )
3264 if( h->param.analyse.b_mixed_references )
3265 x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
3267 x264_mb_analyse_inter_b8x8( h, &analysis );
3269 COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3271 /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
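/* Each half's estimate is the sum of the two 8x8 SATDs it covers plus their averaged
 * MV/ref cost, taken for L0, L1 and BI and keeping the minimum.  The cheaper-looking of
 * 16x8/8x16 is searched first, and either is skipped entirely if its estimate already
 * exceeds the best cost so far. */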
3272 int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
3273 int i_mb_type, i_partition16x8[2], i_partition8x16[2];
3274 for( int i = 0; i < 2; i++ )
3276 int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
3277 int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
3279 i_best_cost = COST_MAX;
3280 i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
3281 i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
3282 i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
3283 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
3284 + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3285 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
3286 + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
3287 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
3288 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
3289 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
3290 analysis.i_cost_est16x8[i] = i_best_cost;
3293 i_best_cost = COST_MAX;
3294 i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
3295 i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
3296 i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
3297 avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
3298 + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3299 avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
3300 + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
3301 COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
3302 COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
3303 COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
3304 analysis.i_cost_est8x16[i] = i_best_cost;
3306 i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
3307 analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3308 i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
3309 i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
3310 analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
3311 i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];
3313 /* We can gain a little speed by checking the mode with the lowest estimated cost first */
3314 int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
3315 if( try_16x8_first && i_cost_est16x8bi_total < i_cost )
3317 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3318 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3320 if( i_cost_est8x16bi_total < i_cost )
3322 x264_mb_analyse_inter_b8x16( h, &analysis, i_cost );
3323 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3325 if( !try_16x8_first && i_cost_est16x8bi_total < i_cost )
3327 x264_mb_analyse_inter_b16x8( h, &analysis, i_cost );
3328 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3332 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3337 else if( i_partition == D_16x16 )
3339 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3340 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3341 if( i_type == B_L0_L0 )
3343 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3344 i_cost = analysis.l0.me16x16.cost
3345 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3347 else if( i_type == B_L1_L1 )
3349 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3350 i_cost = analysis.l1.me16x16.cost
3351 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3353 else if( i_type == B_BI_BI )
3355 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3356 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3359 else if( i_partition == D_16x8 )
3361 for( int i = 0; i < 2; i++ )
3363 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3364 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3365 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3366 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3369 else if( i_partition == D_8x16 )
3371 for( int i = 0; i < 2; i++ )
3373 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3374 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3375 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3376 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3379 else if( i_partition == D_8x8 )
3381 for( int i = 0; i < 4; i++ )
3384 int i_part_cost_old;
3386 int i_part_type = h->mb.i_sub_partition[i];
3387 int b_bidir = (i_part_type == D_BI_8x8);
3389 if( i_part_type == D_DIRECT_8x8 )
3391 if( x264_mb_partition_listX_table[0][i_part_type] )
3393 m = &analysis.l0.me8x8[i];
3394 i_part_cost_old = m->cost;
3395 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3396 m->cost -= i_type_cost;
3397 x264_me_refine_qpel( h, m );
3399 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3401 if( x264_mb_partition_listX_table[1][i_part_type] )
3403 m = &analysis.l1.me8x8[i];
3404 i_part_cost_old = m->cost;
3405 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3406 m->cost -= i_type_cost;
3407 x264_me_refine_qpel( h, m );
3409 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3411 /* TODO: update mvp? */

        i_satd_inter = i_cost;

        if( analysis.i_mbrd )
        {
            x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
            i_type = B_SKIP;
            i_cost = i_bskip_cost;
            i_partition = D_16x16;
            COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
            COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
            COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
            COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
            COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;
        }

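        /* With RD enabled, the decision above is re-done on rate-distortion cost: the running best
         * starts from the B_SKIP cost, and each candidate (L0/L1/BI/direct 16x16, 16x8, 8x16, 8x8)
         * replaces it only if its RD cost from x264_mb_analyse_b_rd() is lower. */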
        if( h->mb.b_chroma_me )
        {
            x264_mb_analyse_intra_chroma( h, &analysis );
            x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma );
            analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
            analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
            analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
        }
        else
            x264_mb_analyse_intra( h, &analysis, i_satd_inter );

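        /* When chroma ME is enabled the inter scores already include chroma SATD, so the intra
         * early-termination threshold is lowered by i_satd_i8x8chroma and the same amount is added
         * to each intra luma score above, keeping the luma+chroma comparison consistent. */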
        if( analysis.i_mbrd )
        {
            x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
            x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
        }

        COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

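        /* The i_satd_inter * 17/16 threshold passed to x264_intra_rd() lets intra modes that score
         * up to ~6% worse than the best inter candidate still get a full RD evaluation; the
         * COPY2_IF_LT() comparisons above then pick intra only if its final score actually wins. */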
        h->mb.i_type = i_type;
        h->mb.i_partition = i_partition;

        if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
            x264_intra_rd_refine( h, &analysis );
        if( h->mb.i_subpel_refine >= 5 )
            x264_refine_bidir( h, &analysis );

        if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
        {
            int i_biweight;
            x264_analyse_update_cache( h, &analysis );

            if( i_partition == D_16x16 )
            {
                if( i_type == B_L0_L0 )
                {
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_type == B_L1_L1 )
                {
                    analysis.l1.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                }
                else if( i_type == B_BI_BI )
                {
                    i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                    x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                }
            }
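            /* h->mb.bipred_weight[l0_ref][l1_ref] is the weighted-bipred factor for this reference
             * pair (roughly: the list-1 share of a pair of weights summing to 64, 32/32 unless
             * implicit weighted bipred changes it); x264_me_refine_bidir_rd() needs it to
             * reproduce the final bi-prediction while refining. */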
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                    if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                    else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                    else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                    }
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                    if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                    else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                        x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                    else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                    }
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                        x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                    else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                        x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                    else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                    }
                }
            }
        }
    }

    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
    }

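    /* The check above samples one 4x4 block from each half of the macroblock: scan8[0] lies in the
     * first 16x8/8x16 partition and scan8[12] in the second, and M32() compares the packed
     * (int16_t x, int16_t y) vector pair in a single 32-bit load.  If both halves ended up with
     * the same vector and reference after RD refinement, the partition collapses to D_16x16. */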
    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
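    /* i_skip_intra lets the encode stage reuse intra prediction results cached during analysis;
     * it is cleared here, presumably because encode-time trellis (b_trellis == 1) or noise
     * reduction would change the coefficients and make those cached results stale. */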
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

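                /* The (x, y, width, height) arguments to x264_macroblock_cache_ref()/_mv_ptr()
                 * address the macroblock's 4x4-block grid: (0,0,4,2) is the top 16x8 half,
                 * (0,2,4,2) the bottom half, (2,0,2,4) the right 8x16 half, and so on. */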
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                                h->mb.cache.mv[l][x264_scan8[15]][0],
                                h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

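/* slicetype.c (the lookahead / frame-type decision code) is compiled as part of this translation
 * unit; presumably this lets it share the static cost tables and analysis helpers defined above
 * without exposing them in a header. */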
#include "slicetype.c"