/*****************************************************************************
 * analyse.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/

#define _ISOC99_SOURCE
#include <math.h>   /* log2f() below requires C99 math */
#include <limits.h> /* INT_MAX */

#include "common/common.h"
#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"     /* x264_me_t, x264_me_search */
#include "ratecontrol.h"

typedef struct
{
    /* 16x16 */
    int       i_ref;
    int       i_rd16x16;
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];

} x264_mb_analysis_list_t;

typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;
    uint16_t *p_cost_ref[2];
    int i_mbrd;

    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    int i_satd_i8x8_dir[12][4];
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_i8x8chroma;
    int i_satd_i8x8chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;

} x264_mb_analysis_t;

/* lambda = pow(2,qp/6-2) */
const uint8_t x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0- 7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
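
/* e.g. at QP=30: lambda = 2^(30/6 - 2) = 2^3 = 8 == x264_lambda_tab[30];
 * below QP=12 the formula yields less than 1 and the table clamps to 1. */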

/* lambda2 = pow(lambda,2) * .9 * 256 */
const int x264_lambda2_tab[52] = {
       14,      18,      22,      28,      36,      45,      57,      72, /*  0 -  7 */
       91,     115,     145,     182,     230,     290,     365,     460, /*  8 - 15 */
      580,     731,     921,    1161,    1462,    1843,    2322,    2925, /* 16 - 23 */
     3686,    4644,    5851,    7372,    9289,   11703,   14745,   18578, /* 24 - 31 */
    23407,   29491,   37156,   46814,   58982,   74313,   93628,  117964, /* 32 - 39 */
   148626,  187257,  235929,  297252,  374514,  471859,  594505,  749029, /* 40 - 47 */
   943718, 1189010, 1498059, 1887436                                      /* 48 - 51 */
};
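
/* e.g. at QP=30: lambda2 = 8*8 * .9 * 256 = 14745.6, stored as 14745. */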

const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
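
/* Each entry is round(256*(2^(i/64) - 1)), i.e. the fractional part of a
 * fixed-point exp2.  A minimal sketch of how such a LUT is consumed, given a
 * Q6 fixed-point exponent i (the real consumer is x264_exp2fix8() in
 * common.h, which additionally biases and clamps its input):
 *
 *     int exp2_approx = (x264_exp2_lut[i&63] + 256) << (i>>6) >> 8;  // ~2^(i/64)
 */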

const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};

/* Avoid an int/float conversion. */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
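
/* A minimal sketch of how the two LUTs above combine into a fast log2 of a
 * positive integer (cf. x264_log2() in common.h; illustrative only):
 *
 *     int lz = x264_clz( x );                          // leading zeros of x
 *     float log2_x = x264_log2_lut[(x<<lz>>24)&0x7f]   // mantissa, 7-bit index
 *                  + x264_log2_lz_lut[lz];             // integer part, 31-lz
 */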

// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      46,      58,      73,      92,     117,     147,
          185,     233,     294,     370,     466,     587,
          740,     932,    1174,    1480,    1864,    2349,
         2959,    3728,    4697,    5918,    7457,    9395,
        11837,   14914,   18790,   23674,   29828,   37581,
        47349,   59656,   75163,   94699,  119313,  150326,
       189399,  238627,  300652,  378798,  477255,  601304,
       757596,  954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      27,      34,      43,      54,      68,      86,
          108,     136,     172,     216,     273,     343,
          433,     545,     687,     865,    1090,    1374,
         1731,    2180,    2747,    3461,    4361,    5494,
         6922,    8721,   10988,   13844,   17442,   21976,
        27688,   34885,   43953,   55377,   69771,   87906,
       110755,  139543,  175813,  221511,  279087,  351627,
       443023,  558174,  703255,  886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};
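
/* e.g. assuming LAMBDA_BITS == 4 (its value in rdo): the first inter entry
 * is .85*.85 * 2^(0/3 + 10 - 4) = .7225 * 64 = 46.24 -> 46, as listed. */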

static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};

/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};

static const uint8_t i_sub_mb_p_cost_table[4] = {
    5, 3, 3, 1
};
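
/* The tables above hold CAVLC code sizes in bits (ue(v) lengths follow the
 * 1,3,3,5,5,... pattern); callers scale them by lambda, e.g.
 *     i_cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
 */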

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static uint16_t x264_cost_ref[92][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
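
/* x264_cost_ref is a static table (indexed by lambda, up to 91) shared by all
 * encoder instances in the process, so its initialization below is serialized
 * with cost_ref_mutex in case several encoders (or threads) call
 * x264_analyse_init_costs() concurrently. */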

int x264_analyse_init_costs( x264_t *h, int qp )
{
    int i, j;
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[lambda] )
        return 0;
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv[lambda] += 2*4*2048;
    for( i = 0; i <= 2*4*2048; i++ )
    {
        h->cost_mv[lambda][-i] =
        h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
    }
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( i = 0; i < 3; i++ )
        for( j = 0; j < 33; j++ )
            x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
    x264_pthread_mutex_unlock( &cost_ref_mutex );
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
    {
        for( j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[lambda][j] += 2*2048;
            for( i = -2*2048; i < 2*2048; i++ )
                h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
        }
    }
    return 0;
fail:
    return -1;
}
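
/* The loop above models the CAVLC size of a motion vector component delta d
 * (in quarter-pels) as lambda * (2*log2(|d|+1) + 0.718 + (d!=0)).
 * e.g. lambda=8 (QP 30), d=4 (one full pel): 8*(2*log2(5) + 0.718 + 1) ~ 51. */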

void x264_analyse_free_costs( x264_t *h )
{
    int i, j;
    for( i = 0; i < 92; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        if( h->cost_mv_fpel[i][0] )
            for( j = 0; j < 4; j++ )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
    }
}

void x264_analyse_weight_frame( x264_t *h, int end )
{
    int j;
    for( j = 0; j < h->i_ref0; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref0[j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << h->param.b_interlaced;
            int offset, height;
            uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
            int k;
            height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;
            if( height )
            {
                for( k = j; k < h->i_ref0; k++ )
                    if( h->sh.weight[k][0].weightfn )
                    {
                        uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                 src + offset, frame->i_stride[0],
                                                 width, height, &h->sh.weight[k][0] );
                    }
            }
            break;
        }
    }
}

/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    a->p_cost_mv = h->cost_mv[a->i_lambda];
    a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}

static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    /* conduct the analysis using this lambda and QP */
    a->i_qp = h->mb.i_qp = i_qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];

    a->i_lambda = x264_lambda_tab[i_qp];
    a->i_lambda2 = x264_lambda2_tab[i_qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
}
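
/* x264_chroma_lambda2_offset_tab is indexed by the luma/chroma QP difference
 * biased by +12; entry 12 (equal QPs) is 256 == 1.0 in Q8 fixed point, and
 * each QP step scales by ~2^(1/3), matching the qp/3 exponent of the lambdas. */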

static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
    int j;

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);

    x264_mb_analyse_init_qp( h, a, i_qp );

    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_i8x8chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;

    a->b_fast_intra = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        if( h->mb.i_mb_x == 0 )
        {
            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                {
                    x264_frame_t **fref = i ? h->fref1 : h->fref0;
                    int i_ref = i ? h->i_ref1 : h->i_ref0;
                    for( j = 0; j < i_ref; j++ )
                    {
                        x264_frame_cond_wait( fref[j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
                    }
                }

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( h->mb.b_interlaced )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
            h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
            h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
            h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
            h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
            h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
        }
#undef CLIP_FMV
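
        /* e.g. for the leftmost macroblock in a row (i_mb_x == 0):
         * mv_min[0] = 4*(-24) = -96 in qpel units, i.e. at most 24 full pixels
         * into the left padding, and mv_max[0] reaches 24 pixels past the
         * right frame edge; both then get clipped to the configured mv range. */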

        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            if(   IS_INTRA( h->mb.i_mb_type_left )
               || IS_INTRA( h->mb.i_mb_type_top )
               || IS_INTRA( h->mb.i_mb_type_topleft )
               || IS_INTRA( h->mb.i_mb_type_topright )
               || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
               || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
            { /* intra is likely */ }
            else
                a->b_fast_intra = 1;
        }
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
        }
        else
            a->b_force_intra = 0;
    }
}

/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t i8x8chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i4x4_mode_available[5][10] =
{
    {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
};

static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
}

static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
}

static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
}
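
/* The indexing relies on MB_LEFT and MB_TOP being bits 0 and 1 of the
 * neighbour mask, so the masked flags 0..3 select the first four table rows
 * directly ("none, left, top, top/left"); in practice MB_TOPLEFT is only set
 * when top and left are also present, which selects the full row 4. */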

/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
    ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};

    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
}

/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;
    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}

static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;

    if( a->i_satd_i8x8chroma < COST_MAX )
        return;

    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );

    /* 8x8 prediction selection for chroma */
    if( predict_mode[3] >= 0 && b_merged_satd )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_8x8_chroma( h, i_mode );
            else
            {
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
                     h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );

            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}

static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    uint8_t *p_src = h->mb.pic.p_fenc[0];
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    int i, idx;
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;

    /*---------------- Try all modes and calculate their score ---------------*/

    /* 16x16 prediction selection */
    const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

    if( b_merged_satd && predict_mode[3] >= 0 )
    {
        h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
        h->predict_16x16[I_PRED_16x16_P]( p_dst );
        a->i_satd_i16x16_dir[I_PRED_16x16_P] =
            h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
        for( i = 0; i < 4; i++ )
        {
            int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
            COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            if( h->mb.b_lossless )
                x264_predict_lossless_16x16( h, i_mode );
            else
                h->predict_16x16[i_mode]( p_dst );

            i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
            COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
            a->i_satd_i16x16_dir[i_mode] = i_satd;
        }
    }

    if( h->sh.i_type == SLICE_TYPE_B )
        /* cavlc mb type prefix */
        a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
    if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
        return;

    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        // FIXME some bias like in i4x4?
        int i_cost = a->i_lambda * 4; /* base predmode costs */
        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( b_merged_satd && predict_mode[8] >= 0 )
            {
                int satd[9];
                h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( i=2; i>=0; i-- )
                {
                    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i];
                    COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                }
                predict_mode += 3;
            }

            for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
            {
                int i_satd;
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                else
                    h->predict_8x8[i_mode]( p_dst_by, edge );

                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                    i_satd -= 3 * a->i_lambda;

                COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * a->i_lambda;
            }
            i_cost += i_best + 3 * a->i_lambda;

            if( idx == 3 || i_cost > i_satd_thresh )
                break;

            /* we need to encode this block now (for next ones) */
            h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
            x264_mb_encode_i8x8( h, idx, a->i_qp );

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
        }
        if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost = a->i_lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
        int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;

        if( a->i_mbrd )
            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
        {
            uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            if( b_merged_satd && predict_mode[5] >= 0 )
            {
                int satd[9];
                h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( i=2; i>=0; i-- )
                    COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
                predict_mode += 3;
            }

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_satd;
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                else
                    h->predict_4x4[i_mode]( p_dst_by );

                i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                {
                    i_satd -= a->i_lambda * 3;
                    if( i_satd <= 0 )
                    {
                        i_best = i_satd;
                        a->i_predict4x4[idx] = i_mode;
                        break;
                    }
                }

                COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
            }
            i_cost += i_best + 3 * a->i_lambda;

            if( i_cost > i_satd_thresh || idx == 15 )
                break;

            /* we need to encode this block now (for next ones) */
            h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
            x264_mb_encode_i4x4( h, idx, a->i_qp );

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
        if( idx == 15 )
        {
            a->i_satd_i4x4 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
            }
        }
        else
            a->i_satd_i4x4 = COST_MAX;
    }
}

static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( a->i_satd_i16x16 <= i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
    {
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
    {
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    int i, idx, x, y;
    int i_mode, i_thresh;
    uint64_t i_satd, i_best;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
    if( predict_mode[1] >= 0 )
    {
        int8_t predict_mode_sorted[4];
        int i_max;
        i_thresh = a->i_satd_i8x8chroma * 5/4;

        for( i_max = 0; *predict_mode >= 0; predict_mode++ )
        {
            i_mode = *predict_mode;
            if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                predict_mode_sorted[i_max++] = i_mode;
        }

        if( i_max > 0 )
        {
            int i_cbp_chroma_best = h->mb.i_cbp_chroma;
            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
            /* the previous thing encoded was x264_intra_rd(), so the pixels and
             * coefs for the current chroma mode are still around, so we only
             * have to recount the bits. */
            i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode_sorted[i];
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8_chroma( h, i_mode );
                else
                {
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
                }
                /* if we've already found a mode that needs no residual, then
                 * probably any mode with a residual will be worse.
                 * so avoid dct on the remaining modes to improve speed. */
                i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
            }
            h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
            h->mb.i_cbp_chroma = i_cbp_chroma_best;
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
        int i_nnz = 0;
        for( idx = 0; idx < 16; idx++ )
        {
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                i_mode = *predict_mode;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                else
                    h->predict_4x4[i_mode]( p_dst_by );
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
                    pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
                    pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
                    pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                }
            }

            M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
            M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
            M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
            M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        for( idx = 0; idx < 4; idx++ )
        {
            uint64_t pels_h = 0;
            uint8_t pels_v[7];
            uint16_t i_nnz[2] = {0}; //shut up gcc
            uint8_t *p_dst_by;
            int j;
            int cbp_luma_new = 0;
            i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;

            i_best = COST_MAX64;
            x = idx&1;
            y = idx>>1;

            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
                    continue;

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                else
                    h->predict_8x8[i_mode]( p_dst_by, edge );
                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    pels_h = M64( p_dst_by+7*FDEC_STRIDE );
                    if( !(idx&1) )
                        for( j=0; j<7; j++ )
                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
            if( !(idx&1) )
                for( j=0; j<7; j++ )
                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}

#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \
    (m)->i_ref = ref;

#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];

#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])
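
/* e.g. REF_COST( 0, 2 ) expands to a->p_cost_ref[0][2]: the lambda-scaled
 * te(v) size of coding ref_idx_l0 == 2, precomputed in
 * x264_analyse_init_costs() above. */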

static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        m.i_ref_cost = REF_COST( 0, i_ref );
        i_halfpel_thresh -= m.i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );

        if( h->mb.ref_blind_dupe == i_ref )
        {
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
        }
        else
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

        /* save mv for predicting neighbors */
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
        CP32( a->l0.mvc[i_ref][0], m.mv );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0
            && a->b_try_skip
            && m.cost-m.cost_mv < 300*a->i_lambda
            && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
             + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
        {
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            return;
        }

        m.cost += m.i_ref_cost;
        i_halfpel_thresh += m.i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
    if( a->i_mbrd )
    {
        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
                h->mb.i_type = P_SKIP;
        }
    }
}

static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_ref, i;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
}

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
    {
        i_maxref = 0;
        CHECK_NEIGHBOUR(  -8 - 1 );
        CHECK_NEIGHBOUR(  -8 + 0 );
        CHECK_NEIGHBOUR(  -8 + 2 );
        CHECK_NEIGHBOUR(  -8 + 4 );
        CHECK_NEIGHBOUR(   0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
#undef CHECK_NEIGHBOUR

    for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        const int x8 = i%2;
        const int y8 = i/2;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];
    int i;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        const int x8 = i%2;
        const int y8 = i/2;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }
        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }
        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}

static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
{
    ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
    uint8_t *pix2 = pix1+8;
    const int i_stride = h->mb.pic.i_stride[1];
    const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
    const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

#define CHROMA4x4MC( width, height, me, x, y ) \
    h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
    if( weight[1].weightfn ) \
        weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
    h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
    if( weight[2].weightfn ) \
        weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );

    if( pixel == PIXEL_4x4 )
    {
        x264_me_t *m = a->l0.me4x4[i8x8];
        CHROMA4x4MC( 2,2, m[0], 0,0 );
        CHROMA4x4MC( 2,2, m[1], 2,0 );
        CHROMA4x4MC( 2,2, m[2], 0,2 );
        CHROMA4x4MC( 2,2, m[3], 2,2 );
    }
    else if( pixel == PIXEL_8x4 )
    {
        x264_me_t *m = a->l0.me8x4[i8x8];
        CHROMA4x4MC( 4,2, m[0], 0,0 );
        CHROMA4x4MC( 4,2, m[1], 0,2 );
    }
    else
    {
        x264_me_t *m = a->l0.me4x8[i8x8];
        CHROMA4x4MC( 2,4, m[0], 0,0 );
        CHROMA4x4MC( 2,4, m[1], 2,0 );
    }
#undef CHROMA4x4MC

    return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}

static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    int i4x4;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}

static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    int i8x4;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}

static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    int i4x8;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}

static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    uint8_t **p_fenc = h->mb.pic.p_fenc;
    uint8_t **p_fdec = h->mb.pic.p_fdec;
    int i;

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    for( i = 0; i < 4; i++ )
    {
        const int x = (i&1)*8;
        const int y = (i>>1)*8;
        a->i_cost16x16direct +=
        a->i_cost8x8direct[i] =
            h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );

        /* mb type cost */
        a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
    }
}

static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
    ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
    uint8_t *src0, *src1;
    int stride0 = 16, stride1 = 16;
    int i_ref, i_mvc, l;
    ALIGNED_4( int16_t mvc[9][2] );
    int try_skip = a->b_try_skip;
    int list1_skipped = 0;
    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
    int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL,
                                h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh[1] : NULL};

    x264_me_t m;
    m.i_pixel = PIXEL_16x16;

    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* 16x16 Search on list 0 and list 1 */
    a->l0.me16x16.cost = INT_MAX;
    a->l1.me16x16.cost = INT_MAX;
    for( l = 1; l >= 0; )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

        /* This loop is extremely munged in order to facilitate the following order of operations,
         * necessary for an efficient fast skip.
         * 1.  Search list1 ref0.
         * 2.  Search list0 ref0.
         * 3.  Try skip.
         * 4.  Search the rest of list0.
         * 5.  Go back and finish list1. */
        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
        {
            if( try_skip && l == 1 && i_ref > 0 )
            {
                list1_skipped = 1;
                break;
            }

            m.i_ref_cost = REF_COST( l, i_ref );

            /* search with ref */
            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );

            /* add ref cost */
            m.cost += m.i_ref_cost;

            if( m.cost < lX->me16x16.cost )
                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );

            /* save mv for predicting neighbors */
            CP32( lX->mvc[i_ref][0], m.mv );
            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );

            /* Fast skip detection. */
            if( i_ref == 0 && try_skip )
            {
                if( abs(lX->bi16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
                    abs(lX->bi16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
                {
                    try_skip = 0;
                }
                else if( !l )
                {
                    /* We already tested skip */
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, a );
                    return;
                }
            }
        }
        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
            break;
        if( list1_skipped && l == 0 )
            l = 1;
        else
            l--;
    }

    /* get cost of BI mode */
    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + ref_costs
                     + a->l0.bi16x16.cost_mv
                     + a->l1.bi16x16.cost_mv;

    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
    {
        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                   + ref_costs + l0_mv_cost + l1_mv_cost;
        if( cost00 < a->i_cost16x16bi )
        {
            M32( a->l0.bi16x16.mv ) = 0;
            M32( a->l1.bi16x16.mv ) = 0;
            a->l0.bi16x16.cost_mv = l0_mv_cost;
            a->l1.bi16x16.cost_mv = l1_mv_cost;
            a->i_cost16x16bi = cost00;
        }
    }

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}
static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
{
    const int x = 2*(i%2);
    const int y = 2*(i/2);

    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
            break;
        case D_L0_8x4:
            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
            break;
        case D_L0_4x8:
            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
            break;
        case D_L0_4x4:
            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
            break;
        default:
            x264_log( h, X264_LOG_ERROR, "internal error\n" );
            break;
    }
}

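/* Load the precomputed direct-mode refs and MVs for one 8x8 block into the
 * cache (used for B_DIRECT/B_SKIP macroblocks and D_DIRECT_8x8 sub-partitions). */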
static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
{
    const int x = 2*(idx&1);
    const int y = 2*(idx>>1);
    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
}

#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
    }

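/* Cache helpers for B partitions: each list either gets its real ref/MV, or,
 * if the partition doesn't use that list, ref -1 with a zero MV (and a zero
 * MVD as well when b_mvd is set). */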
static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    int x = 2*(i&1);
    int y = 2*(i>>1);

    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
    {
        x264_mb_load_mv_direct8x8( h, i );
        if( b_mvd )
        {
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        }
    }
    else
    {
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
    }
}
static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
}
static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
}

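/* 8x8 B-frame analysis with per-partition reference selection: each 8x8 block
 * searches every allowed ref in both lists, then picks L0/L1/BI/direct. */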
static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
    int i, l, i_ref;
    int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    #define CHECK_NEIGHBOUR(i)\
    {\
        int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
        if( ref > i_maxref[l] )\
            i_maxref[l] = ref;\
    }

    for( l = 0; l < 2; l++ )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
        if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
        {
            i_maxref[l] = 0;
            CHECK_NEIGHBOUR(  -8 - 1 );
            CHECK_NEIGHBOUR(  -8 + 0 );
            CHECK_NEIGHBOUR(  -8 + 2 );
            CHECK_NEIGHBOUR(  -8 + 4 );
            CHECK_NEIGHBOUR(   0 - 1 );
            CHECK_NEIGHBOUR( 2*8 - 1 );
        }
    }

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( i = 0; i < 4; i++ )
    {
        int x8 = i%2;
        int y8 = i/2;
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

            lX->me8x8[i].cost = INT_MAX;
            for( i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
            {
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );

                x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );

                /* save mv for predicting other partitions within this MB */
                CP32( lX->mvc[i_ref][i+1], m.mv );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
                                a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
                                a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
                              h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv + a->l0.me8x8[i].i_ref_cost
                        + a->l1.me8x8[i].i_ref_cost + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];

        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

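/* Same as above, but all four 8x8 blocks reuse the refs already chosen by the
 * 16x16 search, so only one ref per list has to be searched. */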
static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
          h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
    int i, l;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( i = 0; i < 4; i++ )
    {
        int x8 = i%2;
        int y8 = i/2;
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

            m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
            m->i_ref = lX->me16x16.i_ref;

            LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            m->cost += m->i_ref_cost;

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* save mv for predicting other partitions within this MB */
            CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, weight_none );
            i_part_cost_bi += m->cost_mv + m->i_ref_cost;
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

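/* 16x8 B-frame analysis: candidate refs are taken from the co-located 8x8
 * partitions, and their MVs seed the predictor list for the search. */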
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j, l;

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {16,16};
        uint8_t *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_16x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me16x8[i].cost = INT_MAX;
            for( j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
                CP32( mvc[2], lX->mvc[i_ref][2*i+2] );

                x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me16x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
                                a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
                                a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
                               h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
                        + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
                        + a->l1.me16x8[i].i_ref_cost;

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */

        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
}

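/* 8x16 B-frame analysis; mirrors the 16x8 case with the block axes swapped. */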
static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j, l;

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x16;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me8x16[i].cost = INT_MAX;
            for( j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][i+1] );
                CP32( mvc[2], lX->mvc[i_ref][i+3] );

                x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x16[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
                                a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
                                a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
                        + a->l1.me8x16[i].i_ref_cost;

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;

        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}

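/* Re-score the best P-mode candidates with full RD. Only modes whose SATD-based
 * cost is within 25% of the best (cost <= i_satd * 5/4) are worth the far more
 * expensive x264_rd_cost_mb call; e.g. with i_satd == 1000, only candidates
 * costing 1250 or less get RD-evaluated. */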
static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
    int thresh = i_satd * 5/4;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
    {
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l0.i_cost16x8 <= thresh )
    {
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 <= thresh )
    {
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 <= thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            int i;
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                if( h->mb.i_sub_partition[i] != btype )
                {
                    h->mb.i_sub_partition[i] = btype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                }
            }
        }
        else
            x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}

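/* B-frame analogue of the above: RD-score each candidate mode at most once
 * (the COST_MAX checks) and only if its SATD cost is within ~6% of the best. */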
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;

    if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}

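/* After the mode decision, give bidir partitions one more SATD-based joint MV
 * refinement, since the earlier searches refined L0 and L1 independently. */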
static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    int i_biweight;
    int i;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
            {
                i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
            }
            break;
        case D_16x8:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
                }
            break;
        case D_8x16:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
                }
            break;
        case D_8x8:
            for( i=0; i<4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
                }
            break;
    }
}

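/* Transform size decision without RD: compare the 4x4-based SATD against the
 * 8x8-based SA8D of the motion-compensated luma and pick the smaller. */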
static inline void x264_mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        int i_cost4, i_cost8;
        /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );
        i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}

static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
    {
        int i_rd8;
        x264_analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed, but the score for comparison already includes chroma */
        i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
            h->mb.b_transform_8x8 ^= 1;
    }
}

/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, direction, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
    int origcbp = h->mb.cbp[h->mb.i_mb_xy];

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from previous
         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
         * With psy-RD, allow 1 failure when moving quant away from previous quant,
         * allow 2 failures when moving quant towards previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;

        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
         * (up to a point) will too. So, jump down to where the threshold will kick in
         * and check the QP there. If the CBP is still empty, skip the main loop.
         * If it isn't empty, we would have ended up having to check this QP anyways,
         * so as long as we store it for later lookup, we lose nothing. */
        int already_checked_qp = -1;
        int already_checked_cost = COST_MAX;
        if( direction == -1 )
        {
            if( !origcbp )
            {
                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
                if( !h->mb.cbp[h->mb.i_mb_xy] )
                {
                    /* If our empty-CBP block is lower QP than the last QP,
                     * the last QP almost surely doesn't have a CBP either. */
                    if( h->mb.i_last_qp > h->mb.i_qp )
                        last_qp_tried = 1;
                    break;
                }
                already_checked_qp = h->mb.i_qp;
                h->mb.i_qp = orig_qp;
            }
        }

        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            if( h->mb.i_qp == already_checked_qp )
                cost = already_checked_cost;
            else
            {
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                cost = x264_rd_cost_mb( h, a->i_lambda2 );
                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
            }

            /* We can't assume that the costs are monotonic over QPs.
             * Tie case-as-failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check transform again; decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}

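/* Top-level mode decision for one macroblock: pick the QP, probe for Skip,
 * run the per-slice-type inter/intra analysis above, then optionally refine
 * the winning mode with RD (qpel-RD, transform RD, QP-RD). */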
/*****************************************************************************
 * x264_macroblock_analyse:
 *****************************************************************************/
void x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;
    int i;

    h->mb.i_qp = x264_ratecontrol_qp( h );
    if( h->param.rc.i_aq_mode )
    {
        x264_adaptive_quant( h );
        /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
         * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
        if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
            h->mb.i_qp = h->mb.i_last_qp;
    }

    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
intra_analysis:
        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
        x264_mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            x264_intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;

        else if( analysis.i_mbrd >= 2 )
            x264_intra_rd_refine( h, &analysis );
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        analysis.b_try_skip = 0;
        if( analysis.b_force_intra )
        {
            if( !h->param.analyse.b_psy )
            {
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
        }
        else
        {
            /* Fast P_SKIP detection */
            if( h->param.analyse.b_fast_pskip )
            {
                if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
                    // FIXME don't need to check this if the reference frame is done
                    {}
                else if( h->param.analyse.i_subpel_refine >= 3 )
                    analysis.b_try_skip = 1;
                else if( h->mb.i_mb_type_left == P_SKIP ||
                         h->mb.i_mb_type_top == P_SKIP ||
                         h->mb.i_mb_type_topleft == P_SKIP ||
                         h->mb.i_mb_type_topright == P_SKIP )
                    b_skip = x264_macroblock_probe_pskip( h );
            }
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            /* Set up MVs for future predictors */
            for( i = 0; i < h->mb.pic.i_fref[0]; i++ )
                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_thresh16x8;
            int i_satd_inter, i_satd_intra;

            x264_mb_analyse_load_costs( h, &analysis );

            x264_mb_analyse_inter_p16x16( h, &analysis );

            if( h->mb.i_type == P_SKIP )
            {
                for( i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
                        if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        x264_mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }

            /* Now do 16x8/8x16 */
            i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
            {
                x264_mb_analyse_inter_p16x8( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                x264_mb_analyse_inter_p8x16( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }

            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                int i8x8;
                i_cost = 0;
                for( i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;
                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }

            if( h->mb.b_chroma_me )
            {
                x264_mb_analyse_intra_chroma( h, &analysis );
                x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
                analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_cost );
            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.i_rd16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

            if( analysis.b_force_intra && !IS_INTRA(i_type) )
            {
                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to
                 * make it appear as if it were an inter block. */
                x264_analyse_update_cache( h, &analysis );
                x264_macroblock_encode( h );
                h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    int i8x8;
                    x264_analyse_update_cache( h, &analysis );
                    for( i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }
        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                if( analysis.i_cost8x8bi < i_cost )
                {
                    i_type = B_8x8;
                    i_partition = D_8x8;
                    i_cost = analysis.i_cost8x8bi;

                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
                        h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b16x8( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
                                     i_type, analysis.i_mb_type16x8,
                                     i_partition, D_16x8 );
                    }
                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
                        h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b8x16( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
                                     i_type, analysis.i_mb_type8x16,
                                     i_partition, D_8x16 );
                    }
                }
            }
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( i=0; i<4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }
            i_satd_inter = i_cost;

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );
            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }
    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
            h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    int i;

    switch( h->mb.i_type )
    {
        case I_4x4:
            for( i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        int l;
        for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
}

#include "slicetype.c"