/*****************************************************************************
 * analyse.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#define _ISOC99_SOURCE
#include <math.h>

#include "common/common.h"
#include "common/cpu.h"
#include "macroblock.h"
#include "ratecontrol.h"
42 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
74 /* conduct the analysis using this lamda and QP */
79 uint16_t *p_cost_ref[2];
84 /* Take some shortcuts in intra search if intra is deemed unlikely */
86 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
91 int i_satd_i16x16_dir[7];
96 int i_satd_i8x8_dir[12][4];
100 int i_predict4x4[16];
105 int i_satd_i8x8chroma;
106 int i_satd_i8x8chroma_dir[7];
107 int i_predict8x8chroma;
109 /* II: Inter part P/B frame */
110 x264_mb_analysis_list_t l0;
111 x264_mb_analysis_list_t l1;
113 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
114 int i_cost16x16direct;
116 int i_cost8x8direct[4];
125 int i_mb_partition16x8[2]; /* mb_partition_e */
126 int i_mb_partition8x16[2];
127 int i_mb_type16x8; /* mb_class_e */
130 int b_direct_available;
132 } x264_mb_analysis_t;
/* lambda = pow(2,qp/6-2) */
/* Per-QP Lagrange multiplier for SATD mode decision; table was left
 * unterminated in the paste — closing brace restored. */
const uint8_t x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
/* lambda2 = pow(lambda,2) * .9 * 256 */
/* Per-QP squared lambda (fixed-point, x256) for RD cost; closing brace
 * restored after truncation in the paste. */
const int x264_lambda2_tab[52] = {
    14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
    91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
   580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
  3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
};
/* Fractional part of 2^x, in 1/256ths, for 64 evenly spaced fractional
 * exponents; closing brace restored after truncation in the paste. */
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
/* log2(1 + i/128) for i in [0,127]: fractional-part lookup for fast log2.
 * Closing brace restored after truncation in the paste. */
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
/* Avoid an int/float conversion. */
/* Maps a leading-zero count (0..31) to the integer part of log2 as a float;
 * closing brace restored after truncation in the paste. */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
/* Per-QP trellis lambdas ([0]=inter, [1]=intra); closing brace restored
 * after truncation in the paste. */
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      46,      58,      73,      92,     117,     147,
          185,     233,     294,     370,     466,     587,
          740,     932,    1174,    1480,    1864,    2349,
         2959,    3728,    4697,    5918,    7457,    9395,
        11837,   14914,   18790,   23674,   29828,   37581,
        47349,   59656,   75163,   94699,  119313,  150326,
       189399,  238627,  300652,  378798,  477255,  601304,
       757596,  954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      27,      34,      43,      54,      68,      86,
          108,     136,     172,     216,     273,     343,
          433,     545,     687,     865,    1090,    1374,
         1731,    2180,    2747,    3461,    4361,    5494,
         6922,    8721,   10988,   13844,   17442,   21976,
        27688,   34885,   43953,   55377,   69771,   87906,
       110755,  139543,  175813,  221511,  279087,  351627,
       443023,  558174,  703255,  886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};
/* Chroma lambda2 scale, indexed by (luma QP - chroma QP + 12); geometric
 * progression, 256 == unity.
 * NOTE(review): the final entry and closing brace were lost in the paste;
 * restored as 65535 (saturated last step) — verify against history. */
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
222 /* TODO: calculate CABAC costs */
223 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
224 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
/* CAVLC bit cost of 16x8/8x16 B partition types; closing brace restored
 * after truncation in the paste. */
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
/* CAVLC bit cost of B sub-macroblock partition types; closing brace
 * restored after truncation in the paste. */
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
232 static const uint8_t i_sub_mb_p_cost_table[4] = {
236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
238 static uint16_t x264_cost_ref[92][3][33];
239 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
241 int x264_analyse_init_costs( x264_t *h, int qp )
244 int lambda = x264_lambda_tab[qp];
245 if( h->cost_mv[lambda] )
247 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
248 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
249 h->cost_mv[lambda] += 2*4*2048;
250 for( i = 0; i <= 2*4*2048; i++ )
252 h->cost_mv[lambda][-i] =
253 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
255 x264_pthread_mutex_lock( &cost_ref_mutex );
256 for( i = 0; i < 3; i++ )
257 for( j = 0; j < 33; j++ )
258 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
259 x264_pthread_mutex_unlock( &cost_ref_mutex );
260 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
264 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
265 h->cost_mv_fpel[lambda][j] += 2*2048;
266 for( i = -2*2048; i < 2*2048; i++ )
267 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
275 void x264_analyse_free_costs( x264_t *h )
278 for( i = 0; i < 92; i++ )
281 x264_free( h->cost_mv[i] - 2*4*2048 );
282 if( h->cost_mv_fpel[i][0] )
283 for( j = 0; j < 4; j++ )
284 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
288 void x264_analyse_weight_frame( x264_t *h, int end )
291 for( j=0; j<h->i_ref0; j++ )
293 if( h->sh.weight[j][0].weightfn )
295 x264_frame_t *frame = h->fref0[j];
296 int width = frame->i_width[0] + 2*PADH;
297 int i_padv = PADV << h->param.b_interlaced;
299 uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
301 height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
302 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
303 h->fenc->i_lines_weighted += height;
306 for( k = j; k < h->i_ref0; k++ )
307 if( h->sh.weight[k][0].weightfn )
309 uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
310 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
311 src + offset, frame->i_stride[0],
312 width, height, &h->sh.weight[k][0] );
320 /* initialize an array of lambda*nbits for all possible mvs */
321 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
323 a->p_cost_mv = h->cost_mv[a->i_lambda];
324 a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
325 a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
328 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
330 /* conduct the analysis using this lamda and QP */
331 a->i_qp = h->mb.i_qp = i_qp;
332 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
334 a->i_lambda = x264_lambda_tab[i_qp];
335 a->i_lambda2 = x264_lambda2_tab[i_qp];
337 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
338 if( h->param.analyse.i_trellis )
340 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
341 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
342 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
343 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
345 h->mb.i_psy_rd_lambda = a->i_lambda;
346 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
347 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
350 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
352 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
354 /* mbrd == 1 -> RD mode decision */
355 /* mbrd == 2 -> RD refinement */
356 /* mbrd == 3 -> QPRD */
357 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
359 x264_mb_analyse_init_qp( h, a, i_qp );
361 h->mb.b_transform_8x8 = 0;
362 h->mb.b_noise_reduction = 0;
368 a->i_satd_i8x8chroma = COST_MAX;
370 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
371 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
375 h->mb.b_lossless ? 0 :
377 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
379 /* II: Inter part P/B frame */
380 if( h->sh.i_type != SLICE_TYPE_I )
383 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
384 // limit motion search to a slightly smaller range than the theoretical limit,
385 // since the search may go a few iterations past its given range
386 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
388 /* Calculate max allowed MV range */
389 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
390 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
391 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
392 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
393 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
394 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
396 int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
397 int max_mv = max_x - 4*16*h->mb.i_mb_x;
398 /* If we're left of the refresh bar, don't reference right of it. */
399 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
400 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
402 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
403 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
404 if( h->mb.i_mb_x == 0 )
406 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
407 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
408 int thread_mvy_range = i_fmv_range;
410 if( h->i_thread_frames > 1 )
412 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
413 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
414 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
416 x264_frame_t **fref = i ? h->fref1 : h->fref0;
417 int i_ref = i ? h->i_ref1 : h->i_ref0;
418 for( j=0; j<i_ref; j++ )
420 x264_frame_cond_wait( fref[j]->orig, thresh );
421 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
425 if( h->param.b_deterministic )
426 thread_mvy_range = h->param.analyse.i_mv_range_thread;
427 if( h->mb.b_interlaced )
428 thread_mvy_range >>= 1;
430 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
433 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
434 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
435 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
436 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
437 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
438 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
439 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
447 a->l0.i_cost8x16 = COST_MAX;
448 if( h->sh.i_type == SLICE_TYPE_B )
453 a->i_cost8x8direct[0] =
454 a->i_cost8x8direct[1] =
455 a->i_cost8x8direct[2] =
456 a->i_cost8x8direct[3] =
465 a->i_cost16x16direct =
468 a->i_cost8x16bi = COST_MAX;
470 else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
471 for( i = 0; i < 4; i++ )
475 a->l0.i_cost4x8[i] = COST_MAX;
478 /* Fast intra decision */
479 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
481 if( IS_INTRA( h->mb.i_mb_type_left )
482 || IS_INTRA( h->mb.i_mb_type_top )
483 || IS_INTRA( h->mb.i_mb_type_topleft )
484 || IS_INTRA( h->mb.i_mb_type_topright )
485 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
486 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
487 { /* intra is likely */ }
494 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
495 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
497 a->b_force_intra = 1;
501 a->b_force_intra = 0;
505 /* Prediction modes allowed for various combinations of neighbors. */
506 /* Terminated by a -1. */
507 /* In order, no neighbors, left, top, top/left, top/left/topleft */
508 static const int8_t i16x16_mode_available[5][5] =
510 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
511 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
512 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
513 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
514 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
517 static const int8_t i8x8chroma_mode_available[5][5] =
519 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
520 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
521 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
522 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
523 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
526 static const int8_t i4x4_mode_available[5][10] =
528 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
529 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
530 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
531 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
532 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
535 static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
537 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
538 return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
541 static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
543 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
544 return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
547 static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
549 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
550 return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
553 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
554 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
556 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
558 if( do_both_dct || h->mb.b_transform_8x8 )
559 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
560 if( do_both_dct || !h->mb.b_transform_8x8 )
561 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
564 /* Reset fenc satd scores cache for psy RD */
565 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
567 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
568 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
569 if( !h->mb.i_psy_rd )
571 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
572 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
574 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
577 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
579 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
581 if( a->i_satd_i8x8chroma < COST_MAX )
584 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
586 /* 8x8 prediction selection for chroma */
587 if( predict_mode[3] >= 0 && b_merged_satd )
589 int satdu[4], satdv[4];
590 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
591 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
592 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
593 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
594 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
595 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
597 for( ; *predict_mode >= 0; predict_mode++ )
599 int i_mode = *predict_mode;
600 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
602 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
603 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
608 for( ; *predict_mode >= 0; predict_mode++ )
611 int i_mode = *predict_mode;
613 /* we do the prediction */
614 if( h->mb.b_lossless )
615 x264_predict_lossless_8x8_chroma( h, i_mode );
618 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
619 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
622 /* we calculate the cost */
623 i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
624 h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
625 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
627 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
628 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
632 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
635 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
637 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
638 uint8_t *p_src = h->mb.pic.p_fenc[0];
639 uint8_t *p_dst = h->mb.pic.p_fdec[0];
642 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
644 /*---------------- Try all mode and calculate their score ---------------*/
646 /* 16x16 prediction selection */
647 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
649 if( b_merged_satd && predict_mode[3] >= 0 )
651 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
652 h->predict_16x16[I_PRED_16x16_P]( p_dst );
653 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
654 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
657 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
658 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
663 for( ; *predict_mode >= 0; predict_mode++ )
666 int i_mode = *predict_mode;
668 if( h->mb.b_lossless )
669 x264_predict_lossless_16x16( h, i_mode );
671 h->predict_16x16[i_mode]( p_dst );
673 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
674 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
675 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
676 a->i_satd_i16x16_dir[i_mode] = i_satd;
680 if( h->sh.i_type == SLICE_TYPE_B )
681 /* cavlc mb type prefix */
682 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
683 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
686 /* 8x8 prediction selection */
687 if( flags & X264_ANALYSE_I8x8 )
689 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
690 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
691 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
693 h->mb.i_cbp_luma = 0;
694 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
696 // FIXME some bias like in i4x4?
697 if( h->sh.i_type == SLICE_TYPE_B )
698 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
700 for( idx = 0;; idx++ )
704 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
705 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
706 int i_best = COST_MAX;
707 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
709 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
710 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
712 if( b_merged_satd && predict_mode[8] >= 0 )
715 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
716 satd[i_pred_mode] -= 3 * a->i_lambda;
717 for( i=2; i>=0; i-- )
719 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
720 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
725 for( ; *predict_mode >= 0; predict_mode++ )
728 int i_mode = *predict_mode;
730 if( h->mb.b_lossless )
731 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
733 h->predict_8x8[i_mode]( p_dst_by, edge );
735 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
736 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
737 i_satd -= a->i_lambda * 3;
739 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
740 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
744 if( idx == 3 || i_cost > i_satd_thresh )
747 /* we need to encode this block now (for next ones) */
748 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
749 x264_mb_encode_i8x8( h, idx, a->i_qp );
751 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
756 a->i_satd_i8x8 = i_cost;
757 if( h->mb.i_skip_intra )
759 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
760 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
761 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
762 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
763 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
764 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
765 if( h->mb.i_skip_intra == 2 )
766 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
771 static const uint16_t cost_div_fix8[3] = {1024,512,341};
772 a->i_satd_i8x8 = COST_MAX;
773 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
775 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
779 /* 4x4 prediction selection */
780 if( flags & X264_ANALYSE_I4x4 )
783 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
784 h->mb.i_cbp_luma = 0;
785 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
787 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
789 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
790 if( h->sh.i_type == SLICE_TYPE_B )
791 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
793 for( idx = 0;; idx++ )
795 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
796 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
797 int i_best = COST_MAX;
798 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
800 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
802 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
803 /* emulate missing topright samples */
804 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
806 if( b_merged_satd && predict_mode[5] >= 0 )
809 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
810 satd[i_pred_mode] -= 3 * a->i_lambda;
811 for( i=2; i>=0; i-- )
812 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
816 for( ; *predict_mode >= 0; predict_mode++ )
819 int i_mode = *predict_mode;
821 if( h->mb.b_lossless )
822 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
824 h->predict_4x4[i_mode]( p_dst_by );
826 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
827 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
828 i_satd -= a->i_lambda * 3;
830 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
832 i_cost += i_best + 4 * a->i_lambda;
834 if( i_cost > i_satd_thresh || idx == 15 )
837 /* we need to encode this block now (for next ones) */
838 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
839 x264_mb_encode_i4x4( h, idx, a->i_qp );
841 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
845 a->i_satd_i4x4 = i_cost;
846 if( h->mb.i_skip_intra )
848 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
849 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
850 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
851 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
852 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
853 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
854 if( h->mb.i_skip_intra == 2 )
855 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
859 a->i_satd_i4x4 = COST_MAX;
863 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
865 if( a->i_satd_i16x16 <= i_satd_thresh )
867 h->mb.i_type = I_16x16;
868 x264_analyse_update_cache( h, a );
869 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
872 a->i_satd_i16x16 = COST_MAX;
874 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
876 h->mb.i_type = I_4x4;
877 x264_analyse_update_cache( h, a );
878 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
881 a->i_satd_i4x4 = COST_MAX;
883 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
885 h->mb.i_type = I_8x8;
886 x264_analyse_update_cache( h, a );
887 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
888 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
891 a->i_satd_i8x8 = COST_MAX;
894 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
896 uint8_t *p_dst = h->mb.pic.p_fdec[0];
899 int i_mode, i_thresh;
900 uint64_t i_satd, i_best;
901 h->mb.i_skip_intra = 0;
903 if( h->mb.i_type == I_16x16 )
905 int old_pred_mode = a->i_predict16x16;
906 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
907 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
908 i_best = a->i_satd_i16x16;
909 for( ; *predict_mode >= 0; predict_mode++ )
911 int i_mode = *predict_mode;
912 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
914 h->mb.i_intra16x16_pred_mode = i_mode;
915 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
916 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
920 /* RD selection for chroma prediction */
921 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
922 if( predict_mode[1] >= 0 )
924 int8_t predict_mode_sorted[4];
926 i_thresh = a->i_satd_i8x8chroma * 5/4;
928 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
930 i_mode = *predict_mode;
931 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
932 predict_mode_sorted[i_max++] = i_mode;
937 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
938 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
939 /* the previous thing encoded was x264_intra_rd(), so the pixels and
940 * coefs for the current chroma mode are still around, so we only
941 * have to recount the bits. */
942 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
943 for( i = 0; i < i_max; i++ )
945 i_mode = predict_mode_sorted[i];
946 if( h->mb.b_lossless )
947 x264_predict_lossless_8x8_chroma( h, i_mode );
950 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
951 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
953 /* if we've already found a mode that needs no residual, then
954 * probably any mode with a residual will be worse.
955 * so avoid dct on the remaining modes to improve speed. */
956 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
957 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
959 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
960 h->mb.i_cbp_chroma = i_cbp_chroma_best;
964 if( h->mb.i_type == I_4x4 )
966 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
968 for( idx = 0; idx < 16; idx++ )
970 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
973 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
975 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
976 /* emulate missing topright samples */
977 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
979 for( ; *predict_mode >= 0; predict_mode++ )
981 i_mode = *predict_mode;
982 if( h->mb.b_lossless )
983 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
985 h->predict_4x4[i_mode]( p_dst_by );
986 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
988 if( i_best > i_satd )
990 a->i_predict4x4[idx] = i_mode;
992 pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
993 pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
994 pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
995 pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
996 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1000 M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1001 M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1002 M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1003 M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1004 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1006 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1009 else if( h->mb.i_type == I_8x8 )
1011 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1012 for( idx = 0; idx < 4; idx++ )
1014 uint64_t pels_h = 0;
1016 uint16_t i_nnz[2] = {0}; //shut up gcc
1019 int cbp_luma_new = 0;
1020 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1022 i_best = COST_MAX64;
1026 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1027 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1028 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1030 for( ; *predict_mode >= 0; predict_mode++ )
1032 i_mode = *predict_mode;
1033 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1036 if( h->mb.b_lossless )
1037 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1039 h->predict_8x8[i_mode]( p_dst_by, edge );
1040 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1041 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1043 if( i_best > i_satd )
1045 a->i_predict8x8[idx] = i_mode;
1046 cbp_luma_new = h->mb.i_cbp_luma;
1049 pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1051 for( j=0; j<7; j++ )
1052 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1053 i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1054 i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1057 a->i_cbp_i8x8_luma = cbp_luma_new;
1058 M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1060 for( j=0; j<7; j++ )
1061 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1062 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1063 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1065 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* Fill an x264_me_t with encoder-side (fenc) plane pointers and strides for
 * the sub-partition at pixel offset (xoff,yoff). Chroma offsets are halved
 * (4:2:0 subsampling). Reads `a` and `h` from the enclosing function scope. */
1070 #define LOAD_FENC( m, src, xoff, yoff) \
1071 (m)->p_cost_mv = a->p_cost_mv; \
1072 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1073 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1074 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1075 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1076 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* Fill the half-pel reference planes ([0..3] luma hpel, [4..5] chroma) and
 * the integral plane for the given list/ref; weighting defaults to
 * weight_none until LOAD_WPELS overrides it. */
1078 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1079 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1080 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1081 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1082 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1083 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1084 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1085 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1086 (m)->weight = weight_none; \
/* Override the whole-pel weighted reference plane for weighted-P prediction.
 * NOTE(review): the body uses `i_ref` from the call site, not the `ref`
 * macro parameter — every current caller has an `i_ref` in scope, but this
 * is fragile; confirm before reusing elsewhere. */
1089 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1090 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1091 (m)->weight = h->sh.weight[i_ref];
/* Lambda-scaled bit cost of coding reference index `ref` in list `list`. */
1093 #define REF_COST(list, ref) \
1094 (a->p_cost_ref[list][ref])
/* P-frame 16x16 motion analysis: search every list-0 reference, keep the
 * cheapest candidate in a->l0.me16x16, and probe for an early P_SKIP exit.
 * May set h->mb.i_type to P_SKIP (early out) or P_L0. */
1096 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1100 ALIGNED_4( int16_t mvc[8][2] );
1101 int i_halfpel_thresh = INT_MAX;
/* Half-pel early-termination threshold is only useful with >1 reference. */
1102 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1104 /* 16x16 Search on all ref frame */
1105 m.i_pixel = PIXEL_16x16;
1106 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1108 a->l0.me16x16.cost = INT_MAX;
1109 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
/* Bias the threshold by the ref cost so refs are compared on total cost. */
1111 m.i_ref_cost = REF_COST( 0, i_ref );
1112 i_halfpel_thresh -= m.i_ref_cost;
1114 /* search with ref */
1115 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1116 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1118 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1119 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
/* A blind duplicate of ref0 (e.g. weightp dupes) can reuse ref0's MV and
 * only needs qpel refinement instead of a full search. */
1121 if( h->mb.ref_blind_dupe == i_ref )
1123 CP32( m.mv, a->l0.mvc[0][0] );
1124 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1127 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1129 /* save mv for predicting neighbors */
1130 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1131 CP32( a->l0.mvc[i_ref][0], m.mv );
1133 /* early termination
1134 * SSD threshold would probably be better than SATD */
/* Cheap residual + MV within 1 qpel of the skip predictor => try P_SKIP. */
1137 && m.cost-m.cost_mv < 300*a->i_lambda
1138 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1139 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1140 && x264_macroblock_probe_pskip( h ) )
1142 h->mb.i_type = P_SKIP;
1143 x264_analyse_update_cache( h, a );
1144 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1148 m.cost += m.i_ref_cost;
1149 i_halfpel_thresh += m.i_ref_cost;
/* Keep the best (lowest total cost) candidate across all references. */
1151 if( m.cost < a->l0.me16x16.cost )
1152 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1155 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1156 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1158 h->mb.i_type = P_L0;
/* In RD mode, if the winner looks like the skip predictor, rate-distortion
 * check whether it codes as an actual P_SKIP (no residual). */
1161 x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
1162 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1164 h->mb.i_partition = D_16x16;
1165 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1166 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1167 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1168 h->mb.i_type = P_SKIP;
/* P-frame 8x8 analysis with per-partition reference selection (mixed refs):
 * each 8x8 block searches refs 0..i_maxref and keeps its own best ref/MV.
 * Results land in a->l0.me8x8[0..3] and a->l0.i_cost8x8. */
1173 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1177 uint8_t **p_fenc = h->mb.pic.p_fenc;
1178 int i_maxref = h->mb.pic.i_fref[0]-1;
1180 h->mb.i_partition = D_8x8;
/* Raise i_maxref to cover any ref actually used by a neighbouring block. */
1182 #define CHECK_NEIGHBOUR(i)\
1184 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1185 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1189 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1190 * than those used by the neighbors */
1191 if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1192 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1195 CHECK_NEIGHBOUR( -8 - 1 );
1196 CHECK_NEIGHBOUR( -8 + 0 );
1197 CHECK_NEIGHBOUR( -8 + 2 );
1198 CHECK_NEIGHBOUR( -8 + 4 );
1199 CHECK_NEIGHBOUR( 0 - 1 );
1200 CHECK_NEIGHBOUR( 2*8 - 1 );
1202 #undef CHECK_NEIGHBOUR
/* Seed each ref's MV candidate list with the 16x16 result for that ref. */
1204 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1205 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1207 for( i = 0; i < 4; i++ )
1209 x264_me_t *l0m = &a->l0.me8x8[i];
1213 m.i_pixel = PIXEL_8x8;
1215 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1216 l0m->cost = INT_MAX;
/* Loop over refs; the blind dupe (if beyond i_maxref) is appended at the
 * end via the i_ref reassignment below. */
1217 for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1219 m.i_ref_cost = REF_COST( 0, i_ref );
1221 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1222 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1224 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1225 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
/* Dupe refs reuse ref0's MV and do qpel refinement only. */
1226 if( h->mb.ref_blind_dupe == i_ref )
1228 CP32( m.mv, a->l0.mvc[0][i+1] );
1229 x264_me_refine_qpel_refdupe( h, &m, NULL );
1232 x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );
1234 m.cost += m.i_ref_cost;
/* Record MV as a candidate for later partitions of this MB. */
1236 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1238 if( m.cost < l0m->cost )
1239 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1240 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1241 i_ref = h->mb.ref_blind_dupe;
1245 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1246 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1248 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1249 are effectively zero. */
1250 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1251 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1254 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1255 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1256 /* P_8x8 ref0 has no ref cost */
1257 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1258 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1259 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1260 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1261 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-frame 8x8 analysis with a single shared reference (no mixed refs):
 * all four 8x8 partitions use the ref chosen by the 16x16 search. */
1264 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1266 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1267 * reference frame flags. Thus, if we're not doing mixedrefs, just
1268 * don't bother analysing the dupes. */
1269 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
/* With CAVLC, ref0 costs nothing to code in P_8x8. */
1270 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1271 uint8_t **p_fenc = h->mb.pic.p_fenc;
1273 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1276 /* XXX Needed for x264_mb_predict_mv */
1277 h->mb.i_partition = D_8x8;
/* Seed the candidate list with the 16x16 winner's MV. */
1280 CP32( mvc[0], a->l0.me16x16.mv );
1282 for( i = 0; i < 4; i++ )
1284 x264_me_t *m = &a->l0.me8x8[i];
1288 m->i_pixel = PIXEL_8x8;
1289 m->i_ref_cost = i_ref_cost;
1291 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1292 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1293 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1295 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1296 x264_me_search( h, m, mvc, i_mvc );
1298 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
/* Append this partition's MV as a candidate for the remaining partitions. */
1300 CP32( mvc[i_mvc], m->mv );
1304 m->cost += i_ref_cost;
1305 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1306 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1309 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1310 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1311 /* theoretically this should include 4*ref_cost,
1312 * but 3 seems a better approximation of cabac. */
1313 if( h->param.b_cabac )
1314 a->l0.i_cost8x8 -= i_ref_cost;
1315 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1316 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-frame 16x8 analysis: for each half, try only the refs chosen by the
 * two underlying 8x8 partitions (1 or 2 candidates). Fills a->l0.me16x8[]
 * and a->l0.i_cost16x8. */
1319 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1322 uint8_t **p_fenc = h->mb.pic.p_fenc;
1323 ALIGNED_4( int16_t mvc[3][2] );
1326 /* XXX Needed for x264_mb_predict_mv */
1327 h->mb.i_partition = D_16x8;
1329 for( i = 0; i < 2; i++ )
1331 x264_me_t *l0m = &a->l0.me16x8[i];
/* Candidate refs = the two refs picked by this half's 8x8 blocks. */
1332 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1333 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1334 const int ref8[2] = { minref, maxref };
1335 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1337 m.i_pixel = PIXEL_16x8;
1339 LOAD_FENC( &m, p_fenc, 0, 8*i );
1340 l0m->cost = INT_MAX;
1341 for( j = 0; j < i_ref8s; j++ )
1343 const int i_ref = ref8[j];
1344 m.i_ref_cost = REF_COST( 0, i_ref );
1346 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
/* MV candidates: this ref's 16x16 MV plus the two covered 8x8 MVs. */
1347 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1348 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1349 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1351 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1352 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1354 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1355 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1356 /* We can only take this shortcut if the first search was performed on ref0. */
1357 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1359 /* We can just leave the MV from the previous ref search. */
1360 x264_me_refine_qpel_refdupe( h, &m, NULL );
1363 x264_me_search( h, &m, mvc, 3 );
1365 m.cost += m.i_ref_cost;
1367 if( m.cost < l0m->cost )
1368 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1370 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1371 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1374 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P-frame 8x16 analysis: mirror of p16x8 for vertical halves. For each
 * column, try the refs chosen by its two 8x8 partitions. Fills
 * a->l0.me8x16[] and a->l0.i_cost8x16. */
1377 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1380 uint8_t **p_fenc = h->mb.pic.p_fenc;
1381 ALIGNED_4( int16_t mvc[3][2] );
1384 /* XXX Needed for x264_mb_predict_mv */
1385 h->mb.i_partition = D_8x16;
1387 for( i = 0; i < 2; i++ )
1389 x264_me_t *l0m = &a->l0.me8x16[i];
/* Candidate refs = the refs picked by the upper/lower 8x8 of this column. */
1390 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1391 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1392 const int ref8[2] = { minref, maxref };
1393 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1395 m.i_pixel = PIXEL_8x16;
1397 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1398 l0m->cost = INT_MAX;
1399 for( j = 0; j < i_ref8s; j++ )
1401 const int i_ref = ref8[j];
1402 m.i_ref_cost = REF_COST( 0, i_ref );
/* MV candidates: this ref's 16x16 MV plus the two covered 8x8 MVs. */
1404 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1405 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1406 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1408 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1409 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1411 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1412 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1413 /* We can only take this shortcut if the first search was performed on ref0. */
1414 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1416 /* We can just leave the MV from the previous ref search. */
1417 x264_me_refine_qpel_refdupe( h, &m, NULL );
1420 x264_me_search( h, &m, mvc, 3 );
1422 m.cost += m.i_ref_cost;
1424 if( m.cost < l0m->cost )
1425 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1427 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1428 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1431 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1434 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1436 ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1437 uint8_t *pix2 = pix1+8;
1438 const int i_stride = h->mb.pic.i_stride[1];
1439 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1440 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1441 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1442 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1443 x264_weight_t *weight = h->sh.weight[i_ref];
1445 #define CHROMA4x4MC( width, height, me, x, y ) \
1446 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1447 if( weight[1].weightfn ) \
1448 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1449 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1450 if( weight[2].weightfn ) \
1451 weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1454 if( pixel == PIXEL_4x4 )
1456 x264_me_t *m = a->l0.me4x4[i8x8];
1457 CHROMA4x4MC( 2,2, m[0], 0,0 );
1458 CHROMA4x4MC( 2,2, m[1], 2,0 );
1459 CHROMA4x4MC( 2,2, m[2], 0,2 );
1460 CHROMA4x4MC( 2,2, m[3], 2,2 );
1462 else if( pixel == PIXEL_8x4 )
1464 x264_me_t *m = a->l0.me8x4[i8x8];
1465 CHROMA4x4MC( 4,2, m[0], 0,0 );
1466 CHROMA4x4MC( 4,2, m[1], 0,2 );
1470 x264_me_t *m = a->l0.me4x8[i8x8];
1471 CHROMA4x4MC( 2,4, m[0], 0,0 );
1472 CHROMA4x4MC( 2,4, m[1], 2,0 );
1475 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1476 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* Search the four 4x4 sub-partitions of 8x8 block i8x8, reusing the 8x8
 * block's reference. Fills a->l0.me4x4[i8x8][] and a->l0.i_cost4x4[i8x8]
 * (including ref cost, sub-partition type cost, and optional chroma cost). */
1479 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1481 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1482 uint8_t **p_fenc = h->mb.pic.p_fenc;
1483 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1486 /* XXX Needed for x264_mb_predict_mv */
1487 h->mb.i_partition = D_8x8;
1489 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1491 const int idx = 4*i8x8 + i4x4;
1492 const int x4 = block_idx_x[idx];
1493 const int y4 = block_idx_y[idx];
/* Only the first 4x4 gets the parent 8x8 MV as an extra candidate. */
1494 const int i_mvc = (i4x4 == 0);
1496 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1498 m->i_pixel = PIXEL_4x4;
1500 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1501 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1502 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1504 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1505 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1507 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1509 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1510 a->l0.me4x4[i8x8][1].cost +
1511 a->l0.me4x4[i8x8][2].cost +
1512 a->l0.me4x4[i8x8][3].cost +
1513 REF_COST( 0, i_ref ) +
1514 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1515 if( h->mb.b_chroma_me )
1516 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* Search the two 8x4 sub-partitions of 8x8 block i8x8, reusing its ref.
 * Fills a->l0.me8x4[i8x8][] and a->l0.i_cost8x4[i8x8]. */
1519 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1521 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1522 uint8_t **p_fenc = h->mb.pic.p_fenc;
1523 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1526 /* XXX Needed for x264_mb_predict_mv */
1527 h->mb.i_partition = D_8x8;
1529 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1531 const int idx = 4*i8x8 + 2*i8x4;
1532 const int x4 = block_idx_x[idx];
1533 const int y4 = block_idx_y[idx];
/* Only the first 8x4 gets the preceding 4x4 search's MV as a candidate. */
1534 const int i_mvc = (i8x4 == 0);
1536 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1538 m->i_pixel = PIXEL_8x4;
1540 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1541 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1542 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1544 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1545 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1547 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1549 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1550 REF_COST( 0, i_ref ) +
1551 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1552 if( h->mb.b_chroma_me )
1553 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* Search the two 4x8 sub-partitions of 8x8 block i8x8, reusing its ref.
 * Fills a->l0.me4x8[i8x8][] and a->l0.i_cost4x8[i8x8]. */
1556 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1558 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1559 uint8_t **p_fenc = h->mb.pic.p_fenc;
1560 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1563 /* XXX Needed for x264_mb_predict_mv */
1564 h->mb.i_partition = D_8x8;
1566 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1568 const int idx = 4*i8x8 + i4x8;
1569 const int x4 = block_idx_x[idx];
1570 const int y4 = block_idx_y[idx];
/* Only the first 4x8 gets the preceding 4x4 search's MV as a candidate. */
1571 const int i_mvc = (i4x8 == 0);
1573 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1575 m->i_pixel = PIXEL_4x8;
1577 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1578 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1579 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1581 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1582 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1584 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1586 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1587 REF_COST( 0, i_ref ) +
1588 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1589 if( h->mb.b_chroma_me )
1590 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* Cost B_DIRECT prediction by comparing fenc against the direct-mode
 * reconstruction already present in fdec (no new motion compensation here).
 * Fills a->i_cost16x16direct and per-8x8 a->i_cost8x8direct[]. */
1593 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1595 /* Assumes that fdec still contains the results of
1596 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1598 uint8_t **p_fenc = h->mb.pic.p_fenc;
1599 uint8_t **p_fdec = h->mb.pic.p_fdec;
1602 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1603 for( i = 0; i < 4; i++ )
1605 const int x = (i&1)*8;
1606 const int y = (i>>1)*8;
/* Per-8x8 direct cost accumulates into the 16x16 total as it is computed. */
1607 a->i_cost16x16direct +=
1608 a->i_cost8x8direct[i] =
1609 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1612 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
/* B-frame 16x16 analysis: search both lists over all refs, derive the BI
 * (bipred) candidate from the two list winners, and probe a fast B_SKIP
 * exit. Fills a->l0.me16x16, a->l1.me16x16, the bi16x16 pair and
 * a->i_cost16x16bi. */
1616 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1618 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1619 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1620 uint8_t *src0, *src1;
1621 int stride0 = 16, stride1 = 16;
1622 int i_ref, i_mvc, l;
1623 ALIGNED_4( int16_t mvc[9][2] );
1624 int try_skip = a->b_try_skip;
1625 int list1_skipped = 0;
1626 int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
1627 int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
1628 h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};
1631 m.i_pixel = PIXEL_16x16;
1633 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1635 /* 16x16 Search on list 0 and list 1 */
1636 a->l0.me16x16.cost = INT_MAX;
1637 a->l1.me16x16.cost = INT_MAX;
/* Outer loop starts on list 1 so the skip probe can run after l1ref0+l0ref0. */
1638 for( l = 1; l >= 0; )
1640 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1642 /* This loop is extremely munged in order to facilitate the following order of operations,
1643 * necessary for an efficient fast skip.
1644 * 1. Search list1 ref0.
1645 * 2. Search list0 ref0.
1647 * 4. Search the rest of list0.
1648 * 5. Go back and finish list1.
/* On the second pass through list 1, ref 0 was already searched — resume at 1. */
1650 for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1652 if( try_skip && l == 1 && i_ref > 0 )
1658 m.i_ref_cost = REF_COST( l, i_ref );
1660 /* search with ref */
1661 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1662 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1663 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1664 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );
1667 m.cost += m.i_ref_cost;
1669 if( m.cost < lX->me16x16.cost )
1670 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1672 /* save mv for predicting neighbors */
1673 CP32( lX->mvc[i_ref][0], m.mv );
1674 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1676 /* Fast skip detection. */
1677 if( i_ref == 0 && try_skip )
/* Skip is only plausible if the found MV is within 1 qpel of direct MV. */
1679 if( abs(lX->bi16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
1680 abs(lX->bi16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
1686 /* We already tested skip */
1687 h->mb.i_type = B_SKIP;
1688 x264_analyse_update_cache( h, a );
1693 if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
1695 if( list1_skipped && l == 0 )
1701 /* get cost of BI mode */
/* BI starts from each list's winner; bi16x16 MVs may later diverge. */
1702 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1703 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1704 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1705 src0 = h->mc.get_ref( pix0, &stride0,
1706 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1707 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1708 src1 = h->mc.get_ref( pix1, &stride1,
1709 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1710 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1712 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1714 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1716 + a->l0.bi16x16.cost_mv
1717 + a->l1.bi16x16.cost_mv;
1719 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1720 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
/* Cost of coding (0,0) MVs = cost of the negated predictors. */
1722 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1723 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1724 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1725 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
/* With zero MVs, the reference planes can be averaged directly. */
1726 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1727 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1728 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1729 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1730 + ref_costs + l0_mv_cost + l1_mv_cost;
1731 if( cost00 < a->i_cost16x16bi )
1733 M32( a->l0.bi16x16.mv ) = 0;
1734 M32( a->l1.bi16x16.mv ) = 0;
1735 a->l0.bi16x16.cost_mv = l0_mv_cost;
1736 a->l1.bi16x16.cost_mv = l1_mv_cost;
1737 a->i_cost16x16bi = cost00;
/* Add macroblock-type signalling costs for the three 16x16 B modes. */
1742 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1743 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1744 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the chosen sub-partition MVs of P-frame 8x8 block i into the
 * macroblock MV cache, dispatching on the selected sub-partition type. */
1747 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1749 const int x = 2*(i%2);
1750 const int y = 2*(i/2);
1752 switch( h->mb.i_sub_partition[i] )
1755 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1758 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1759 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1762 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1763 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1766 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1767 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1768 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1769 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
/* Unknown sub-partition type: internal inconsistency. */
1772 x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* Load the precomputed direct-mode refs and MVs for 8x8 block idx into the
 * macroblock cache (both lists). */
1777 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1779 const int x = 2*(idx&1);
1780 const int y = 2*(idx>>1);
1781 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1782 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1783 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1784 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
/* Cache refs/MVs for a B partition: for each list used by `part`, store the
 * ME result; for each unused list, store ref -1 and zero MV (and zero MVD
 * when b_mvd is set at the call site). */
1787 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1788 if( x264_mb_partition_listX_table[0][part] ) \
1790 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
1791 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1795 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1796 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1798 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1800 if( x264_mb_partition_listX_table[1][part] ) \
1802 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
1803 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1807 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1808 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1810 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache refs/MVs for B-frame 8x8 block i. Direct sub-partitions reuse the
 * precomputed direct MVs; everything else goes through CACHE_MV_BI. */
1813 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1817 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1819 x264_mb_load_mv_direct8x8( h, i );
/* Direct blocks carry no MVD and are marked skippable. */
1822 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1823 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1824 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1829 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache refs/MVs for B-frame 16x8 half i via CACHE_MV_BI. */
1832 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1834 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache refs/MVs for B-frame 8x16 half i via CACHE_MV_BI. */
1836 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1838 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B-frame 8x8 analysis with per-partition reference selection: each 8x8
 * block searches both lists over its allowed refs, then picks the cheapest
 * of L0 / L1 / BI / DIRECT per block. Accumulates a->i_cost8x8bi. */
1842 static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1844 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1846 int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
1848 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1849 * than those used by the neighbors */
1850 #define CHECK_NEIGHBOUR(i)\
1852 int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
1853 if( ref > i_maxref[l] )\
1857 for( l = 0; l < 2; l++ )
1859 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1860 if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
1861 h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
1864 CHECK_NEIGHBOUR( -8 - 1 );
1865 CHECK_NEIGHBOUR( -8 + 0 );
1866 CHECK_NEIGHBOUR( -8 + 2 );
1867 CHECK_NEIGHBOUR( -8 + 4 );
1868 CHECK_NEIGHBOUR( 0 - 1 );
1869 CHECK_NEIGHBOUR( 2*8 - 1 );
1873 /* XXX Needed for x264_mb_predict_mv */
1874 h->mb.i_partition = D_8x8;
1878 for( i = 0; i < 4; i++ )
1884 int stride[2] = {8,8};
1887 m.i_pixel = PIXEL_8x8;
1888 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1890 for( l = 0; l < 2; l++ )
1892 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1894 lX->me8x8[i].cost = INT_MAX;
1895 for( i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
/* NOTE(review): stray double semicolon below — harmless, clean up. */
1897 m.i_ref_cost = REF_COST( l, i_ref );;
1899 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
1901 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
1902 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
1903 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
1904 m.cost += m.i_ref_cost;
1906 if( m.cost < lX->me8x8[i].cost )
1907 h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
1909 /* save mv for predicting other partitions within this MB */
1910 CP32( lX->mvc[i_ref][i+1], m.mv );
/* Build the BI candidate from each list's 8x8 winner and average them. */
1915 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
1916 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
1917 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
1918 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
1919 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
1920 h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
1922 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1923 + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv + a->l0.me8x8[i].i_ref_cost
1924 + a->l1.me8x8[i].i_ref_cost + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1926 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1927 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* Pick the cheapest of L0 / L1 / BI / DIRECT for this 8x8 block. */
1929 i_part_cost = a->l0.me8x8[i].cost;
1930 h->mb.i_sub_partition[i] = D_L0_8x8;
1931 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1932 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1933 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1934 a->i_cost8x8bi += i_part_cost;
1936 /* XXX Needed for x264_mb_predict_mv */
1937 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1941 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B-frame 8x8 analysis with a single shared ref per list (no mixed refs):
 * all four 8x8 blocks reuse each list's 16x16 reference. Per block, picks
 * the cheapest of L0 / L1 / BI / DIRECT and accumulates a->i_cost8x8bi. */
1944 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1946 uint8_t **p_fref[2] =
1947 { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
1948 h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
1949 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1952 /* XXX Needed for x264_mb_predict_mv */
1953 h->mb.i_partition = D_8x8;
1957 for( i = 0; i < 4; i++ )
1962 int i_part_cost_bi = 0;
1963 int stride[2] = {8,8};
1966 for( l = 0; l < 2; l++ )
1968 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1969 x264_me_t *m = &lX->me8x8[i];
1970 m->i_pixel = PIXEL_8x8;
1971 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1973 m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
1974 m->i_ref = lX->me16x16.i_ref;
1976 LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
1978 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
1979 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
/* Single candidate: the 16x16 winner's MV for this list. */
1980 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1981 m->cost += m->i_ref_cost;
1983 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1985 /* save mv for predicting other partitions within this MB */
1986 CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
/* Accumulate the BI candidate while both list results are at hand. */
1989 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1990 m->mv[0], m->mv[1], 8, 8, weight_none );
1991 i_part_cost_bi += m->cost_mv + m->i_ref_cost;
1993 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
1994 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1995 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1996 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1997 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* Pick the cheapest of L0 / L1 / BI / DIRECT for this 8x8 block. */
1999 i_part_cost = a->l0.me8x8[i].cost;
2000 h->mb.i_sub_partition[i] = D_L0_8x8;
2001 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
2002 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
2003 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
2004 a->i_cost8x8bi += i_part_cost;
2006 /* XXX Needed for x264_mb_predict_mv */
2007 x264_mb_cache_mv_b8x8( h, a, i, 0 );
2011 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* 16x8 bi-predictive partition analysis for B-frames.
 * For each of the two 16x8 halves: search L0 and L1 (trying the refs used
 * by the co-located 8x8 partitions, skipping the second candidate if both
 * 8x8s used the same ref), build the BI prediction by averaging the two
 * best list predictions, and pick the cheapest of L0 / L1 / BI. */
2014 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
2016 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
/* three MV candidates per ref: the 16x16 MV plus the two 8x8 MVs of this half */
2017 ALIGNED_4( int16_t mvc[3][2] );
2020 h->mb.i_partition = D_16x8;
2021 a->i_cost16x8bi = 0;
2023 for( i = 0; i < 2; i++ )
2026 int i_part_cost_bi = 0;
2027 int stride[2] = {16,16};
2030 m.i_pixel = PIXEL_16x8;
2031 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
2033 for( l = 0; l < 2; l++ )
2035 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
/* candidate refs: those chosen by the two 8x8 blocks covering this half */
2036 int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
2037 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2038 lX->me16x8[i].cost = INT_MAX;
2039 for( j = 0; j < i_ref8s; j++ )
2042 m.i_ref_cost = REF_COST( l, i_ref ); /* fixed: stray second ';' removed */
2044 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
2046 CP32( mvc[0], lX->mvc[i_ref][0] );
2047 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
2048 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
2050 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
2051 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
2052 x264_me_search( h, &m, mvc, 3 );
2053 m.cost += m.i_ref_cost;
/* keep the best (ref, mv) found over the candidate references */
2055 if( m.cost < lX->me16x8[i].cost )
2056 h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
/* build the bi-directional prediction from the best L0/L1 results */
2061 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
2062 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
2063 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
2064 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
2065 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
2066 h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
2068 i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
2069 + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
2070 + a->l1.me16x8[i].i_ref_cost;
/* choose the cheapest of L0 / L1 / BI for this half */
2072 i_part_cost = a->l0.me16x8[i].cost;
2073 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
2075 if( a->l1.me16x8[i].cost < i_part_cost )
2077 i_part_cost = a->l1.me16x8[i].cost;
2078 a->i_mb_partition16x8[i] = D_L1_8x8;
2080 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2082 i_part_cost = i_part_cost_bi;
2083 a->i_mb_partition16x8[i] = D_BI_8x8;
2085 a->i_cost16x8bi += i_part_cost;
2087 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* map the two per-half decisions to a macroblock type and add its bit cost */
2091 a->i_mb_type16x8 = B_L0_L0
2092 + (a->i_mb_partition16x8[0]>>2) * 3
2093 + (a->i_mb_partition16x8[1]>>2);
2094 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* 8x16 bi-predictive partition analysis for B-frames — the mirror image of
 * x264_mb_analyse_inter_b16x8, with the two halves side by side instead of
 * stacked. */
2097 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
2099 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
/* fixed: was mvc[2][2], but three candidates (mvc[0..2]) are filled below
 * and passed to x264_me_search( ..., 3 ) — [2][2] overflowed the array.
 * Now sized [3][2], matching x264_mb_analyse_inter_b16x8. */
2100 ALIGNED_4( int16_t mvc[3][2] );
2103 h->mb.i_partition = D_8x16;
2104 a->i_cost8x16bi = 0;
2106 for( i = 0; i < 2; i++ )
2109 int i_part_cost_bi = 0;
2110 int stride[2] = {8,8};
2113 m.i_pixel = PIXEL_8x16;
2114 LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
2116 for( l = 0; l < 2; l++ )
2118 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
/* candidate refs: those chosen by the two 8x8 blocks covering this half */
2119 int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
2120 int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
2121 lX->me8x16[i].cost = INT_MAX;
2122 for( j = 0; j < i_ref8s; j++ )
2125 m.i_ref_cost = REF_COST( l, i_ref );
2127 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
2129 CP32( mvc[0], lX->mvc[i_ref][0] );
2130 CP32( mvc[1], lX->mvc[i_ref][i+1] );
2131 CP32( mvc[2], lX->mvc[i_ref][i+3] );
2133 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
2134 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
2135 x264_me_search( h, &m, mvc, 3 );
2136 m.cost += m.i_ref_cost;
/* keep the best (ref, mv) found over the candidate references */
2138 if( m.cost < lX->me8x16[i].cost )
2139 h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
/* build the bi-directional prediction from the best L0/L1 results */
2144 src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
2145 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
2146 src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
2147 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
2148 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
2150 i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
2151 + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
2152 + a->l1.me8x16[i].i_ref_cost;
/* choose the cheapest of L0 / L1 / BI for this half */
2154 i_part_cost = a->l0.me8x16[i].cost;
2155 a->i_mb_partition8x16[i] = D_L0_8x8;
2157 if( a->l1.me8x16[i].cost < i_part_cost )
2159 i_part_cost = a->l1.me8x16[i].cost;
2160 a->i_mb_partition8x16[i] = D_L1_8x8;
2162 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2164 i_part_cost = i_part_cost_bi;
2165 a->i_mb_partition8x16[i] = D_BI_8x8;
2167 a->i_cost8x16bi += i_part_cost;
2169 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* map the two per-half decisions to a macroblock type and add its bit cost */
2173 a->i_mb_type8x16 = B_L0_L0
2174 + (a->i_mb_partition8x16[0]>>2) * 3
2175 + (a->i_mb_partition8x16[1]>>2);
2176 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* P-frame RD refinement: replace the SATD cost of each candidate partition
 * with a full rate-distortion cost, but only for modes whose SATD cost is
 * already within 5/4 (3/2 for 16x16) of the best; others are disqualified
 * by setting their cost to COST_MAX.
 * NOTE(review): this extract is missing some interior lines ('else' branches
 * and a couple of declarations) present upstream. */
2179 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2181 int thresh = i_satd * 5/4;
2183 h->mb.i_type = P_L0;
2184 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2186 h->mb.i_partition = D_16x16;
2187 x264_analyse_update_cache( h, a );
2188 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2191 if( a->l0.i_cost16x8 <= thresh )
2193 h->mb.i_partition = D_16x8;
2194 x264_analyse_update_cache( h, a );
2195 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2198 a->l0.i_cost16x8 = COST_MAX;
2200 if( a->l0.i_cost8x16 <= thresh )
2202 h->mb.i_partition = D_8x16;
2203 x264_analyse_update_cache( h, a );
2204 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2207 a->l0.i_cost8x16 = COST_MAX;
2209 if( a->l0.i_cost8x8 <= thresh )
2211 h->mb.i_type = P_8x8;
2212 h->mb.i_partition = D_8x8;
2213 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2216 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2217 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2218 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2219 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2220 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2221 * for future blocks are those left over from previous RDO calls. */
/* per-8x8 sub-partition refinement: RD-score each sub-mode within 5/4 of
 * the best SATD sub-mode and keep the RD winner */
2222 for( i = 0; i < 4; i++ )
2224 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2225 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2226 int subtype, btype = D_L0_8x8;
2227 uint64_t bcost = COST_MAX64;
2228 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
/* D_L0_8x8 is forced as a fallback if nothing else was evaluated */
2231 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2233 h->mb.i_sub_partition[i] = subtype;
2234 x264_mb_cache_mv_p8x8( h, a, i );
2235 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2236 COPY2_IF_LT( bcost, cost, btype, subtype );
/* restore the winning sub-partition in the cache if the loop left another */
2238 if( h->mb.i_sub_partition[i] != btype )
2240 h->mb.i_sub_partition[i] = btype;
2241 x264_mb_cache_mv_p8x8( h, a, i );
2246 x264_analyse_update_cache( h, a );
2247 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2250 a->l0.i_cost8x8 = COST_MAX;
/* B-frame RD refinement: compute full RD cost for each candidate B mode
 * whose SATD cost is within ~17/16 (18/16 with psy-RD) of the best inter
 * cost, and only if it has not been RD-scored yet (cost still COST_MAX). */
2253 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2255 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2257 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2259 h->mb.i_type = B_DIRECT;
2260 /* Assumes direct/skip MC is still in fdec */
2261 /* Requires b-rdo to be done before intra analysis */
2262 h->mb.b_skip_mc = 1;
2263 x264_analyse_update_cache( h, a );
2264 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2265 h->mb.b_skip_mc = 0;
2268 //FIXME not all the update_cache calls are needed
2269 h->mb.i_partition = D_16x16;
/* L0-only 16x16 */
2271 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2273 h->mb.i_type = B_L0_L0;
2274 x264_analyse_update_cache( h, a );
2275 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* L1-only 16x16 */
2279 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2281 h->mb.i_type = B_L1_L1;
2282 x264_analyse_update_cache( h, a );
2283 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* bi-predicted 16x16 */
2287 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2289 h->mb.i_type = B_BI_BI;
2290 x264_analyse_update_cache( h, a );
2291 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* 8x8 partitions; the skip-cache reset undoes any skip flags set above */
2295 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2297 h->mb.i_type = B_8x8;
2298 h->mb.i_partition = D_8x8;
2299 x264_analyse_update_cache( h, a );
2300 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2301 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
/* 16x8 partitions, using the mb type picked by the 16x8 analysis */
2305 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2307 h->mb.i_type = a->i_mb_type16x8;
2308 h->mb.i_partition = D_16x8;
2309 x264_analyse_update_cache( h, a );
2310 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* 8x16 partitions, using the mb type picked by the 8x16 analysis */
2314 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2316 h->mb.i_type = a->i_mb_type8x16;
2317 h->mb.i_partition = D_8x16;
2318 x264_analyse_update_cache( h, a );
2319 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* Subpel-refine the L0/L1 MV pair of every partition that chose BI
 * prediction, using SATD-based bidirectional refinement.  Intra MBs have
 * nothing to refine and return immediately.
 * NOTE(review): the case labels of the switch (upstream: D_16x16, D_16x8,
 * D_8x16, D_8x8) are missing from this extract. */
2323 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2328 if( IS_INTRA(h->mb.i_type) )
2331 switch( h->mb.i_partition )
2334 if( h->mb.i_type == B_BI_BI )
2336 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
2337 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2341 for( i=0; i<2; i++ )
2342 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2344 i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
2345 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2349 for( i=0; i<2; i++ )
2350 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2352 i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
2353 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2357 for( i=0; i<4; i++ )
2358 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2360 i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
2361 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
/* Non-RD transform size decision: compare SA8D (8x8-transform proxy) against
 * SATD (4x4-transform proxy) of the luma difference between fenc and the
 * prediction in fdec, and set b_transform_8x8 to whichever scores lower.
 * b_skip_mc is set because the MC performed here can be reused by
 * macroblock_encode. */
2367 static inline void x264_mb_analyse_transform( x264_t *h )
2369 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2371 int i_cost4, i_cost8;
2372 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2375 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2376 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2377 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2378 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2380 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2381 h->mb.b_skip_mc = 1;
/* RD-based transform size decision: toggle the 4x4/8x8 transform flag,
 * re-score the MB, and keep the toggled state if its RD cost is no worse;
 * otherwise toggle back.  On success *i_rd is updated (upstream; the
 * assignment line is missing from this extract) and *i_satd is rescaled in
 * proportion so later SATD-vs-SATD comparisons stay consistent. */
2385 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2387 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2390 x264_analyse_update_cache( h, a );
2391 h->mb.b_transform_8x8 ^= 1;
2392 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2393 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2395 if( *i_rd >= i_rd8 )
/* rescale the SATD score by the observed RD improvement ratio */
2398 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2402 h->mb.b_transform_8x8 ^= 1;
2406 /* Rate-distortion optimal QP selection.
2407 * FIXME: More than half of the benefit of this function seems to be
2408 * in the way it improves the coding of chroma DC (by decimating or
2409 * finding a better way to code a single DC coefficient.)
2410 * There must be a more efficient way to get that portion of the benefit
2411 * without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
/* Hill-climbs the MB QP in both directions from the rate-control QP,
 * RD-scoring each step, with a bounded number of non-improving steps
 * (more tolerance with psy-RD or when moving towards the previous MB's QP).
 * The best QP found is left in h->mb.i_qp/i_chroma_qp; the final transform
 * size is re-checked since the QP change may have invalidated it.
 * NOTE(review): several interior lines (braces, some statements such as the
 * final bqp restore) are missing from this extract relative to upstream. */
2413 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2415 int bcost, cost, direction, failures, prevcost, origcost;
2416 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2417 int last_qp_tried = 0;
2418 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2419 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2421 /* If CBP is already zero, don't raise the quantizer any higher. */
2422 for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2424 /* Without psy-RD, require monotonicity when moving quant away from previous
2425 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2426 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2427 * allow 2 failures when moving quant towards previous quant.
2428 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2429 int threshold = (!!h->mb.i_psy_rd);
2430 /* Raise the threshold for failures if we're moving towards the last QP. */
2431 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2432 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2434 h->mb.i_qp = orig_qp;
2436 prevcost = origcost;
2438 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2439 * (up to a point) will too. So, jump down to where the threshold will kick in
2440 * and check the QP there. If the CBP is still empty, skip the main loop.
2441 * If it isn't empty, we would have ended up having to check this QP anyways,
2442 * so as long as we store it for later lookup, we lose nothing. */
2443 int already_checked_qp = -1;
2444 int already_checked_cost = COST_MAX;
2445 if( direction == -1 )
2449 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2450 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2451 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2452 if( !h->mb.cbp[h->mb.i_mb_xy] )
2454 /* If our empty-CBP block is lower QP than the last QP,
2455 * the last QP almost surely doesn't have a CBP either. */
2456 if( h->mb.i_last_qp > h->mb.i_qp )
/* remember the probed QP so the main loop can reuse its cost */
2460 already_checked_qp = h->mb.i_qp;
2461 h->mb.i_qp = orig_qp;
2465 h->mb.i_qp += direction;
2466 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2468 if( h->mb.i_last_qp == h->mb.i_qp )
2470 if( h->mb.i_qp == already_checked_qp )
2471 cost = already_checked_cost;
2474 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2475 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2476 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2479 /* We can't assume that the costs are monotonic over QPs.
2480 * Tie case-as-failure seems to give better results. */
2481 if( cost < prevcost )
2487 if( failures > threshold )
/* raising QP once CBP is empty cannot help further */
2489 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2491 h->mb.i_qp += direction;
2495 /* Always try the last block's QP. */
2496 if( !last_qp_tried )
2498 h->mb.i_qp = h->mb.i_last_qp;
2499 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2500 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2501 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2505 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2507 /* Check transform again; decision from before may no longer be optimal. */
2508 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2509 x264_mb_transform_8x8_allowed( h ) )
2511 h->mb.b_transform_8x8 ^= 1;
2512 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2514 h->mb.b_transform_8x8 ^= 1;
2518 /*****************************************************************************
2519 * x264_macroblock_analyse:
2520 *****************************************************************************/
2521 void x264_macroblock_analyse( x264_t *h )
2523 x264_mb_analysis_t analysis;
2524 int i_cost = COST_MAX;
2527 h->mb.i_qp = x264_ratecontrol_qp( h );
2528 if( h->param.rc.i_aq_mode )
2530 x264_adaptive_quant( h );
2531 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2532 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2533 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2534 h->mb.i_qp = h->mb.i_last_qp;
2537 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2539 /*--------------------------- Do the analysis ---------------------------*/
2540 if( h->sh.i_type == SLICE_TYPE_I )
2543 if( analysis.i_mbrd )
2544 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2545 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2546 if( analysis.i_mbrd )
2547 x264_intra_rd( h, &analysis, COST_MAX );
2549 i_cost = analysis.i_satd_i16x16;
2550 h->mb.i_type = I_16x16;
2551 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2552 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2553 if( analysis.i_satd_pcm < i_cost )
2554 h->mb.i_type = I_PCM;
2556 else if( analysis.i_mbrd >= 2 )
2557 x264_intra_rd_refine( h, &analysis );
2559 else if( h->sh.i_type == SLICE_TYPE_P )
2563 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2565 analysis.b_try_skip = 0;
2566 if( analysis.b_force_intra )
2568 if( !h->param.analyse.b_psy )
2570 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2571 goto intra_analysis;
2576 /* Fast P_SKIP detection */
2577 if( h->param.analyse.b_fast_pskip )
2579 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2580 // FIXME don't need to check this if the reference frame is done
2582 else if( h->param.analyse.i_subpel_refine >= 3 )
2583 analysis.b_try_skip = 1;
2584 else if( h->mb.i_mb_type_left == P_SKIP ||
2585 h->mb.i_mb_type_top == P_SKIP ||
2586 h->mb.i_mb_type_topleft == P_SKIP ||
2587 h->mb.i_mb_type_topright == P_SKIP )
2588 b_skip = x264_macroblock_probe_pskip( h );
2592 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2596 h->mb.i_type = P_SKIP;
2597 h->mb.i_partition = D_16x16;
2598 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2599 /* Set up MVs for future predictors */
2601 for( i = 0; i < h->mb.pic.i_fref[0]; i++ )
2602 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2606 const unsigned int flags = h->param.analyse.inter;
2610 int i_satd_inter, i_satd_intra;
2612 x264_mb_analyse_load_costs( h, &analysis );
2614 x264_mb_analyse_inter_p16x16( h, &analysis );
2616 if( h->mb.i_type == P_SKIP )
2618 for( i = 1; i < h->mb.pic.i_fref[0]; i++ )
2619 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2623 if( flags & X264_ANALYSE_PSUB16x16 )
2625 if( h->param.analyse.b_mixed_references )
2626 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2628 x264_mb_analyse_inter_p8x8( h, &analysis );
2631 /* Select best inter mode */
2633 i_partition = D_16x16;
2634 i_cost = analysis.l0.me16x16.cost;
2636 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2637 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2640 i_partition = D_8x8;
2641 i_cost = analysis.l0.i_cost8x8;
2644 if( flags & X264_ANALYSE_PSUB8x8 )
2646 for( i = 0; i < 4; i++ )
2648 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2649 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2651 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2652 h->mb.i_sub_partition[i] = D_L0_4x4;
2654 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2655 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2656 h->mb.i_sub_partition[i], D_L0_8x4 );
2658 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2659 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2660 h->mb.i_sub_partition[i], D_L0_4x8 );
2662 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2664 x264_mb_cache_mv_p8x8( h, &analysis, i );
2666 analysis.l0.i_cost8x8 = i_cost;
2670 /* Now do 16x8/8x16 */
2671 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2672 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2673 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2675 x264_mb_analyse_inter_p16x8( h, &analysis );
2676 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2678 x264_mb_analyse_inter_p8x16( h, &analysis );
2679 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2682 h->mb.i_partition = i_partition;
2685 //FIXME mb_type costs?
2686 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2690 else if( i_partition == D_16x16 )
2692 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2693 i_cost = analysis.l0.me16x16.cost;
2695 else if( i_partition == D_16x8 )
2697 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2698 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2699 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2701 else if( i_partition == D_8x16 )
2703 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2704 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2705 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2707 else if( i_partition == D_8x8 )
2711 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2713 switch( h->mb.i_sub_partition[i8x8] )
2716 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2717 i_cost += analysis.l0.me8x8[i8x8].cost;
2720 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2721 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2722 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2723 analysis.l0.me8x4[i8x8][1].cost;
2726 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2727 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2728 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2729 analysis.l0.me4x8[i8x8][1].cost;
2733 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2734 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2735 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2736 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2737 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2738 analysis.l0.me4x4[i8x8][1].cost +
2739 analysis.l0.me4x4[i8x8][2].cost +
2740 analysis.l0.me4x4[i8x8][3].cost;
2743 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2749 if( h->mb.b_chroma_me )
2751 x264_mb_analyse_intra_chroma( h, &analysis );
2752 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2753 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2754 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2755 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2758 x264_mb_analyse_intra( h, &analysis, i_cost );
2760 i_satd_inter = i_cost;
2761 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2762 analysis.i_satd_i8x8,
2763 analysis.i_satd_i4x4 );
2765 if( analysis.i_mbrd )
2767 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2769 i_partition = D_16x16;
2770 i_cost = analysis.l0.i_rd16x16;
2771 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2772 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2773 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2774 h->mb.i_type = i_type;
2775 h->mb.i_partition = i_partition;
2776 if( i_cost < COST_MAX )
2777 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2778 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2781 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2782 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2783 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2784 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2786 h->mb.i_type = i_type;
2788 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2790 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2791 * it was an inter block. */
2792 x264_analyse_update_cache( h, &analysis );
2793 x264_macroblock_encode( h );
2794 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2795 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2796 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2797 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2798 goto intra_analysis;
2801 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2803 if( IS_INTRA( h->mb.i_type ) )
2805 x264_intra_rd_refine( h, &analysis );
2807 else if( i_partition == D_16x16 )
2809 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2810 analysis.l0.me16x16.cost = i_cost;
2811 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2813 else if( i_partition == D_16x8 )
2815 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2816 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2817 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2818 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2819 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2820 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2822 else if( i_partition == D_8x16 )
2824 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2825 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2826 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2827 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2828 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2829 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2831 else if( i_partition == D_8x8 )
2834 x264_analyse_update_cache( h, &analysis );
2835 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2837 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2839 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2841 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2843 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2844 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2846 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2848 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2849 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2851 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2853 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2854 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2855 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2856 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2863 else if( h->sh.i_type == SLICE_TYPE_B )
2865 int i_bskip_cost = COST_MAX;
2868 if( analysis.i_mbrd )
2869 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2871 h->mb.i_type = B_SKIP;
2872 if( h->mb.b_direct_auto_write )
2874 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2875 for( i = 0; i < 2; i++ )
2878 h->sh.b_direct_spatial_mv_pred ^= 1;
2879 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2880 if( analysis.b_direct_available )
2885 b_skip = x264_macroblock_probe_bskip( h );
2887 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2894 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2896 analysis.b_try_skip = 0;
2897 if( analysis.b_direct_available )
2899 if( !h->mb.b_direct_auto_write )
2901 if( analysis.i_mbrd )
2903 i_bskip_cost = ssd_mb( h );
2904 /* 6 = minimum cavlc cost of a non-skipped MB */
2905 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2907 else if( !h->mb.b_direct_auto_write )
2909 /* Conditioning the probe on neighboring block types
2910 * doesn't seem to help speed or quality. */
2911 analysis.b_try_skip = x264_macroblock_probe_bskip( h );
2912 if( h->param.analyse.i_subpel_refine < 3 )
2913 b_skip = analysis.b_try_skip;
2915 /* Set up MVs for future predictors */
2918 for( i = 0; i < h->mb.pic.i_fref[0]; i++ )
2919 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2920 for( i = 0; i < h->mb.pic.i_fref[1]; i++ )
2921 M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
2927 const unsigned int flags = h->param.analyse.inter;
2931 h->mb.b_skip_mc = 0;
2932 h->mb.i_type = B_DIRECT;
2934 x264_mb_analyse_load_costs( h, &analysis );
2936 /* select best inter mode */
2937 /* direct must be first */
2938 if( analysis.b_direct_available )
2939 x264_mb_analyse_inter_direct( h, &analysis );
2941 x264_mb_analyse_inter_b16x16( h, &analysis );
2943 if( h->mb.i_type == B_SKIP )
2945 for( i = 1; i < h->mb.pic.i_fref[0]; i++ )
2946 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2947 for( i = 1; i < h->mb.pic.i_fref[1]; i++ )
2948 M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
2953 i_partition = D_16x16;
2954 i_cost = analysis.l0.me16x16.cost;
2955 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2956 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2957 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2959 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2961 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2962 if( i_bskip_cost < analysis.i_rd16x16direct &&
2963 i_bskip_cost < analysis.i_rd16x16bi &&
2964 i_bskip_cost < analysis.l0.i_rd16x16 &&
2965 i_bskip_cost < analysis.l1.i_rd16x16 )
2967 h->mb.i_type = B_SKIP;
2968 x264_analyse_update_cache( h, &analysis );
2973 if( flags & X264_ANALYSE_BSUB16x16 )
2975 if( h->param.analyse.b_mixed_references )
2976 x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
2978 x264_mb_analyse_inter_b8x8( h, &analysis );
2980 if( analysis.i_cost8x8bi < i_cost )
2983 i_partition = D_8x8;
2984 i_cost = analysis.i_cost8x8bi;
2986 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2987 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2989 x264_mb_analyse_inter_b16x8( h, &analysis );
2990 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2991 i_type, analysis.i_mb_type16x8,
2992 i_partition, D_16x8 );
2994 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2995 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2997 x264_mb_analyse_inter_b8x16( h, &analysis );
2998 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2999 i_type, analysis.i_mb_type8x16,
3000 i_partition, D_8x16 );
3005 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
3010 else if( i_partition == D_16x16 )
3012 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3013 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3014 if( i_type == B_L0_L0 )
3016 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
3017 i_cost = analysis.l0.me16x16.cost
3018 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
3020 else if( i_type == B_L1_L1 )
3022 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
3023 i_cost = analysis.l1.me16x16.cost
3024 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
3026 else if( i_type == B_BI_BI )
3028 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
3029 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
3032 else if( i_partition == D_16x8 )
3034 for( i=0; i<2; i++ )
3036 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
3037 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
3038 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
3039 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
3042 else if( i_partition == D_8x16 )
3044 for( i=0; i<2; i++ )
3046 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
3047 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
3048 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
3049 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
3052 else if( i_partition == D_8x8 )
3054 for( i=0; i<4; i++ )
3057 int i_part_cost_old;
3059 int i_part_type = h->mb.i_sub_partition[i];
3060 int b_bidir = (i_part_type == D_BI_8x8);
3062 if( i_part_type == D_DIRECT_8x8 )
3064 if( x264_mb_partition_listX_table[0][i_part_type] )
3066 m = &analysis.l0.me8x8[i];
3067 i_part_cost_old = m->cost;
3068 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
3069 m->cost -= i_type_cost;
3070 x264_me_refine_qpel( h, m );
3072 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3074 if( x264_mb_partition_listX_table[1][i_part_type] )
3076 m = &analysis.l1.me8x8[i];
3077 i_part_cost_old = m->cost;
3078 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
3079 m->cost -= i_type_cost;
3080 x264_me_refine_qpel( h, m );
3082 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
3084 /* TODO: update mvp? */
3088 i_satd_inter = i_cost;
3090 if( analysis.i_mbrd )
3092 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
3094 i_cost = i_bskip_cost;
3095 i_partition = D_16x16;
3096 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
3097 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
3098 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
3099 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
3100 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
3101 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
3102 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
3104 h->mb.i_type = i_type;
3105 h->mb.i_partition = i_partition;
3108 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
3110 if( analysis.i_mbrd )
3112 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
3113 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
3116 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
3117 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
3118 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
3119 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
3121 h->mb.i_type = i_type;
3122 h->mb.i_partition = i_partition;
3124 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
3125 x264_intra_rd_refine( h, &analysis );
3126 if( h->mb.i_subpel_refine >= 5 )
3127 x264_refine_bidir( h, &analysis );
3129 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
3132 x264_analyse_update_cache( h, &analysis );
3134 if( i_partition == D_16x16 )
3136 if( i_type == B_L0_L0 )
3138 analysis.l0.me16x16.cost = i_cost;
3139 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
3141 else if( i_type == B_L1_L1 )
3143 analysis.l1.me16x16.cost = i_cost;
3144 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3146 else if( i_type == B_BI_BI )
3148 i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
3149 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3152 else if( i_partition == D_16x8 )
3154 for( i = 0; i < 2; i++ )
3156 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3157 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3158 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3159 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3160 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3161 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3163 i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
3164 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3168 else if( i_partition == D_8x16 )
3170 for( i = 0; i < 2; i++ )
3172 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3173 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3174 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3175 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3176 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3177 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3179 i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
3180 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3184 else if( i_partition == D_8x8 )
3186 for( i = 0; i < 4; i++ )
3188 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3189 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3190 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3191 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3192 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3194 i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
3195 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
3203 x264_analyse_update_cache( h, &analysis );
3205 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3206 * without realizing it. Check for this and account for it if necessary. */
3207 if( analysis.i_mbrd >= 2 )
3209 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3210 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3211 int list = check_mv_lists[h->mb.i_type] - 1;
3212 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3213 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3214 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3215 h->mb.i_partition = D_16x16;
3218 if( !analysis.i_mbrd )
3219 x264_mb_analyse_transform( h );
3221 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3222 x264_mb_analyse_qp_rd( h, &analysis );
3224 h->mb.b_trellis = h->param.analyse.i_trellis;
3225 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3226 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3227 x264_psy_trellis_init( h, 0 );
3228 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3229 h->mb.i_skip_intra = 0;
3232 /*-------------------- Update MB from the analysis ----------------------*/
/* x264_analyse_update_cache(): commit the winning analysis decision for the
 * current macroblock into h->mb.cache -- intra prediction modes for intra
 * types, or reference indices + motion vectors per partition for inter types
 * -- so the encode/reconstruct stage can read the decision back.
 *
 * NOTE(review): this extract has lost structural lines (braces, `case`
 * labels, local declarations) -- the stray leading numbers on each line are
 * the original file's line numbers, and the gaps in that numbering mark the
 * dropped lines.  The comments below describe the intent of each surviving
 * fragment; confirm against a pristine copy of analyse.c before editing. */
3233 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3237 switch( h->mb.i_type )
/* --- intra 4x4: one predicted mode per 4x4 sub-block, indexed via scan8,
 *     then chroma intra analysis --- */
3240 for( i = 0; i < 16; i++ )
3241 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3243 x264_mb_analyse_intra_chroma( h, a );
/* --- intra 8x8: one predicted mode per 8x8 quadrant; (2*(i&1), 2*(i>>1))
 *     maps quadrant index i to an x,y offset --- */
3246 for( i = 0; i < 4; i++ )
3247 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3249 x264_mb_analyse_intra_chroma( h, a );
/* --- intra 16x16: a single whole-MB prediction mode --- */
3252 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3253 x264_mb_analyse_intra_chroma( h, a );
/* --- P_L0 (list-0-only inter): fan out by partition shape; the numeric
 *     arguments look like (x, y, width, height, list) in 4x4-block units --
 *     TODO confirm against the cache_ref/cache_mv_ptr declarations --- */
3260 switch( h->mb.i_partition )
/* 16x16: one ref + one MV covering the whole MB */
3263 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3264 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
/* 16x8: top and bottom halves get independent refs/MVs */
3268 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3269 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3270 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3271 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
/* 8x16: left and right halves get independent refs/MVs */
3275 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3276 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3277 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3278 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
/* any other partition value with P_L0 is an internal inconsistency */
3282 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* --- P_8x8: four refs cached per quadrant, then per-8x8 MV caching
 *     (including sub-partitions) via the p8x8 helper --- */
3288 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3289 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3290 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3291 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3292 for( i = 0; i < 4; i++ )
3293 x264_mb_cache_mv_p8x8( h, a, i );
/* --- P skip: forced 16x16 partition, reference 0, and the precomputed
 *     skip-predictor MV --- */
3298 h->mb.i_partition = D_16x16;
3299 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3300 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
/* --- B direct/skip: partition comes from the cached direct partition;
 *     direct-mode MVs are loaded for all four 8x8 blocks --- */
3306 h->mb.i_partition = h->mb.cache.direct_partition;
3307 x264_mb_load_mv_direct8x8( h, 0 );
3308 x264_mb_load_mv_direct8x8( h, 1 );
3309 x264_mb_load_mv_direct8x8( h, 2 );
3310 x264_mb_load_mv_direct8x8( h, 3 );
/* --- B_8x8: per-8x8 bidirectional MV caching --- */
3314 /* optimize: cache might not need to be rewritten */
3315 for( i = 0; i < 4; i++ )
3316 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3319 default: /* the rest of the B types */
3320 switch( h->mb.i_partition )
/* 16x16 B modes: inner switch on the exact type */
3323 switch( h->mb.i_type )
/* L0-only: cache list-0 ref/MV, then mark list 1 unused
 * (ref -1, zero mv and mvd) */
3326 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3327 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3329 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3330 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3331 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
/* L1-only: mirror image -- clear list 0, cache list 1 */
3334 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3335 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3336 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3338 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
3339 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
/* BI: cache the bi-predicted refs/MVs for both lists */
3342 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
3343 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3345 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
3346 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
/* 16x8 / 8x16 B partitions: one helper call per half */
3351 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3352 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3355 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3356 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3359 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
/* --- frame-threading sanity check: with sliced frame parallelism, an inter
 *     MV must not point at reference rows the producing thread has not yet
 *     completed (orig->i_lines_completed).  On violation, log diagnostics
 *     and recover by re-analysing the MB as intra 16x16.
 *     NOTE(review): a guard for ref < 0 (line ~3372) appears to be among the
 *     dropped lines. --- */
3365 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
/* check list 0 always; list 1 only in B slices */
3368 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3371 int ref = h->mb.cache.ref[l][x264_scan8[0]];
/* `ref >> b_interlaced` maps a field ref index to its frame; the MV y
 * component is in quarter-pel, hence the (2 - b_interlaced) shift */
3374 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3375 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3377 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3378 x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
3379 x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
3380 h->mb.cache.mv[l][x264_scan8[15]][0],
3381 h->mb.cache.mv[l][x264_scan8[15]][1] );
3382 x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
3383 x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3384 x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
3385 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3386 x264_mb_analyse_intra( h, a, COST_MAX );
3387 h->mb.i_type = I_16x16;
3388 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3389 x264_mb_analyse_intra_chroma( h, a );
3396 #include "slicetype.c"