1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
74 /* conduct the analysis using this lambda and QP */
79 uint16_t *p_cost_ref[2];
84 /* Take some shortcuts in intra search if intra is deemed unlikely */
86 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
91 int i_satd_i16x16_dir[7];
96 int i_satd_i8x8_dir[12][4];
100 int i_predict4x4[16];
105 int i_satd_i8x8chroma;
106 int i_satd_i8x8chroma_dir[7];
107 int i_predict8x8chroma;
109 /* II: Inter part P/B frame */
110 x264_mb_analysis_list_t l0;
111 x264_mb_analysis_list_t l1;
113 int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
114 int i_cost16x16direct;
116 int i_cost8x8direct[4];
125 int i_mb_partition16x8[2]; /* mb_partition_e */
126 int i_mb_partition8x16[2];
127 int i_mb_type16x8; /* mb_class_e */
130 int b_direct_available;
132 } x264_mb_analysis_t;
134 /* lambda = pow(2,qp/6-2) */
135 const int x264_lambda_tab[52] = {
136 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
137 1, 1, 1, 1, /* 8-11 */
138 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
139 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
140 6, 7, 8, 9,10,11,13,14, /* 28-35 */
141 16,18,20,23,25,29,32,36, /* 36-43 */
142 40,45,51,57,64,72,81,91 /* 44-51 */
145 /* lambda2 = pow(lambda,2) * .9 * 256 */
146 const int x264_lambda2_tab[52] = {
147 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
148 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
149 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
150 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
151 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
152 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
153 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
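/* Illustrative sketch (not part of x264): the two tables above follow the
 * formulas in their comments; the lambda table is additionally clamped to a
 * minimum of 1, and the lambda2 entries appear to be truncated rather than
 * rounded. The function names below are made up for demonstration only. */
#if 0
#include <math.h>
static int lambda_from_qp( int qp )
{
    /* lambda = pow(2, qp/6 - 2), rounded, never below 1 */
    int lambda = (int)( powf( 2.0f, qp/6.0f - 2.0f ) + 0.5f );
    return lambda < 1 ? 1 : lambda;
}
static int lambda2_from_qp( int qp )
{
    /* lambda2 = pow(lambda, 2) * .9 * 256 using the unrounded lambda,
     * e.g. qp=51: (2^6.5)^2 * .9 * 256 = 1887436.8 -> 1887436 */
    double lambda = pow( 2.0, qp/6.0 - 2.0 );
    return (int)( lambda * lambda * 0.9 * 256.0 );
}
#endif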
156 const uint8_t x264_exp2_lut[64] = {
157 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
158 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
159 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
160 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
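/* The table above appears to store the fractional part of a base-2 exponential:
 * x264_exp2_lut[i] ~= (2^(i/64) - 1) * 256, e.g. i=32 gives (sqrt(2)-1)*256 ~= 106.
 * The code that consumes it lives elsewhere in common/. */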
163 const float x264_log2_lut[128] = {
164 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
165 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
166 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
167 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
168 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
169 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
170 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
171 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
172 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
173 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
174 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
175 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
176 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
177 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
178 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
179 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
182 /* Avoid an int/float conversion. */
183 const float x264_log2_lz_lut[32] = {
184 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
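/* Sketch (not the actual helper, which lives elsewhere in common/) of how the
 * two tables above can combine into a fast log2 of a 32-bit integer:
 * x264_log2_lut[i] holds log2(1 + i/128) and x264_log2_lz_lut[lz] holds 31-lz. */
#if 0
static inline float approx_log2( uint32_t x )
{
    int lz = __builtin_clz( x );   /* the clz intrinsic used here is an assumption; x must be nonzero */
    /* shift the leading 1 to bit 31, then index the mantissa LUT with the next 7 bits */
    return x264_log2_lz_lut[lz] + x264_log2_lut[(x << lz >> 24) & 0x7f];
}
#endif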
187 // should the intra and inter lambdas be different?
188 // I'm just matching the behaviour of deadzone quant.
189 static const int x264_trellis_lambda2_tab[2][52] = {
190 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
191 { 46, 58, 73, 92, 117, 147,
192 185, 233, 294, 370, 466, 587,
193 740, 932, 1174, 1480, 1864, 2349,
194 2959, 3728, 4697, 5918, 7457, 9395,
195 11837, 14914, 18790, 23674, 29828, 37581,
196 47349, 59656, 75163, 94699, 119313, 150326,
197 189399, 238627, 300652, 378798, 477255, 601304,
198 757596, 954511, 1202608, 1515192, 1909022, 2405217,
199 3030384, 3818045, 4810435, 6060769 },
200 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
201 { 27, 34, 43, 54, 68, 86,
202 108, 136, 172, 216, 273, 343,
203 433, 545, 687, 865, 1090, 1374,
204 1731, 2180, 2747, 3461, 4361, 5494,
205 6922, 8721, 10988, 13844, 17442, 21976,
206 27688, 34885, 43953, 55377, 69771, 87906,
207 110755, 139543, 175813, 221511, 279087, 351627,
208 443023, 558174, 703255, 886046, 1116348, 1406511,
209 1772093, 2232697, 2813022, 3544186 }
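/* Worked example for the table above: judging from the entries, LAMBDA_BITS is 4,
 * so e.g. the inter entry for qp=12 is .85*.85 * 2**(12/3. + 10 - 4)
 * = .7225 * 1024 ~= 740. */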
212 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
213 16, 20, 25, 32, 40, 50,
214 64, 80, 101, 128, 161, 203,
215 256, 322, 406, 512, 645, 812,
216 1024, 1290, 1625, 2048, 2580, 3250,
217 4096, 5160, 6501, 8192, 10321, 13003,
218 16384, 20642, 26007, 32768, 41285, 52015,
222 /* TODO: calculate CABAC costs */
223 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
224 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
226 static const int i_mb_b16x8_cost_table[17] = {
227 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
229 static const int i_sub_mb_b_cost_table[13] = {
230 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
232 static const int i_sub_mb_p_cost_table[4] = {
236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
238 static uint16_t x264_cost_ref[92][3][33];
239 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
241 int x264_analyse_init_costs( x264_t *h, int qp )
244 int lambda = x264_lambda_tab[qp];
245 if( h->cost_mv[lambda] )
247 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
248 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
249 h->cost_mv[lambda] += 2*4*2048;
250 for( i = 0; i <= 2*4*2048; i++ )
252 h->cost_mv[lambda][-i] =
253 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
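    /* Usage note (the lookups themselves happen in the motion estimation code,
     * not here): the table is indexed with the signed qpel difference between a
     * candidate MV and the predictor, roughly
     *     cost = p_cost_mv[mv[0]-mvp[0]] + p_cost_mv[mv[1]-mvp[1]],
     * which is why it is centered and spans +/- 2*4*2048 entries. */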
255 x264_pthread_mutex_lock( &cost_ref_mutex );
256 for( i = 0; i < 3; i++ )
257 for( j = 0; j < 33; j++ )
258 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
259 x264_pthread_mutex_unlock( &cost_ref_mutex );
260 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
264 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
265 h->cost_mv_fpel[lambda][j] += 2*2048;
266 for( i = -2*2048; i < 2*2048; i++ )
267 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
275 void x264_analyse_free_costs( x264_t *h )
278 for( i = 0; i < 92; i++ )
281 x264_free( h->cost_mv[i] - 2*4*2048 );
282 if( h->cost_mv_fpel[i][0] )
283 for( j = 0; j < 4; j++ )
284 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
288 void x264_analyse_weight_frame( x264_t *h, int end )
291 for( j=0; j<h->i_ref0; j++ )
293 if( h->sh.weight[j][0].weightfn )
295 x264_frame_t *frame = h->fref0[j];
296 int width = frame->i_width[0] + 2*PADH;
297 int i_padv = PADV << h->param.b_interlaced;
299 uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
301 height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
302 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
303 h->fenc->i_lines_weighted += height;
306 for( k = j; k < h->i_ref0; k++ )
307 if( h->sh.weight[k][0].weightfn )
309 uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
310 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
311 src + offset, frame->i_stride[0],
312 width, height, &h->sh.weight[k][0] );
320 /* initialize an array of lambda*nbits for all possible mvs */
321 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
323 a->p_cost_mv = h->cost_mv[a->i_lambda];
324 a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
325 a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
328 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
330 /* conduct the analysis using this lambda and QP */
331 a->i_qp = h->mb.i_qp = i_qp;
332 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
334 a->i_lambda = x264_lambda_tab[i_qp];
335 a->i_lambda2 = x264_lambda2_tab[i_qp];
337 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
338 if( h->param.analyse.i_trellis )
340 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
341 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
342 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
343 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
345 h->mb.i_psy_rd_lambda = a->i_lambda;
346 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
347 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
351 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
353 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
355 /* mbrd == 1 -> RD mode decision */
356 /* mbrd == 2 -> RD refinement */
357 /* mbrd == 3 -> QPRD */
358 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
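    /* e.g. --subme 7 in a P slice gives i = 7, so i_mbrd = 1 (RD mode decision);
     * --subme 9 gives i_mbrd = 2 (adds RD refinement); --subme 10 gives i_mbrd = 3
     * (adds QPRD). B slices subtract 1 from i, so the first two thresholds
     * effectively require one subme level more there. */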
360 x264_mb_analyse_init_qp( h, a, i_qp );
362 h->mb.i_me_method = h->param.analyse.i_me_method;
363 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
364 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
365 && h->mb.i_subpel_refine >= 5;
367 h->mb.b_transform_8x8 = 0;
368 h->mb.b_noise_reduction = 0;
374 a->i_satd_i8x8chroma = COST_MAX;
376 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
377 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
381 h->mb.b_lossless ? 0 :
383 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
385 /* II: Inter part P/B frame */
386 if( h->sh.i_type != SLICE_TYPE_I )
389 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
390 // limit motion search to a slightly smaller range than the theoretical limit,
391 // since the search may go a few iterations past its given range
392 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
394 /* Calculate max allowed MV range */
395 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
396 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
397 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
398 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
399 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
400 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
402 int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
403 int max_mv = max_x - 4*16*h->mb.i_mb_x;
404 /* If we're left of the refresh bar, don't reference right of it. */
405 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
406 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
408 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
409 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
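    /* e.g. if mv_max_spel[0] was clipped to 1023 qpel units, the fullpel search
     * is limited to (1023>>2) - 6 = 249 pixels, leaving headroom for the search
     * patterns and hpel interpolation mentioned above. */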
410 if( h->mb.i_mb_x == 0 )
412 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
413 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
414 int thread_mvy_range = i_fmv_range;
416 if( h->param.i_threads > 1 && !h->param.b_sliced_threads )
418 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
419 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
420 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
422 x264_frame_t **fref = i ? h->fref1 : h->fref0;
423 int i_ref = i ? h->i_ref1 : h->i_ref0;
424 for( j=0; j<i_ref; j++ )
426 x264_frame_cond_wait( fref[j]->orig, thresh );
427 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
431 if( h->param.b_deterministic )
432 thread_mvy_range = h->param.analyse.i_mv_range_thread;
433 if( h->mb.b_interlaced )
434 thread_mvy_range >>= 1;
436 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
439 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
440 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
441 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
442 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
443 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
444 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
445 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
451 a->l0.i_cost8x8 = COST_MAX;
453 for( i = 0; i < 4; i++ )
457 a->l0.i_cost4x8[i] = COST_MAX;
461 a->l0.i_cost8x16 = COST_MAX;
462 if( h->sh.i_type == SLICE_TYPE_B )
466 a->l1.i_cost8x8 = COST_MAX;
468 for( i = 0; i < 4; i++ )
473 a->i_cost8x8direct[i] = COST_MAX;
484 a->i_cost16x16direct =
487 a->i_cost8x16bi = COST_MAX;
490 /* Fast intra decision */
491 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
493 if( IS_INTRA( h->mb.i_mb_type_left )
494 || IS_INTRA( h->mb.i_mb_type_top )
495 || IS_INTRA( h->mb.i_mb_type_topleft )
496 || IS_INTRA( h->mb.i_mb_type_topright )
497 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
498 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
499 { /* intra is likely */ }
506 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
507 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
509 a->b_force_intra = 1;
513 a->b_force_intra = 0;
517 /* Prediction modes allowed for various combinations of neighbors. */
518 /* Terminated by a -1. */
519 /* In order, no neighbors, left, top, top/left, top/left/topleft */
520 static const int8_t i16x16_mode_available[5][5] =
522 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
523 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
524 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
525 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
526 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
529 static const int8_t i8x8chroma_mode_available[5][5] =
531 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
532 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
533 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
534 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
535 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
538 static const int8_t i4x4_mode_available[5][10] =
540 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
541 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
542 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
543 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
544 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
547 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
549 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
550 return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
553 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
555 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
556 return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
559 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
561 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
562 return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
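/* These three helpers assume MB_LEFT and MB_TOP occupy the two low bits of
 * i_neighbour, so idx is 0..3 for none/left/top/top+left exactly as the table
 * rows are ordered above, and any macroblock that also has its top-left
 * neighbour available falls through to row 4. The actual bit definitions live
 * in common/. */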
565 /* For trellis=2, we need to do this for both sizes of DCT; for trellis=1 we only need to use it on the chosen mode. */
566 static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
568 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
570 if( do_both_dct || h->mb.b_transform_8x8 )
571 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
572 if( do_both_dct || !h->mb.b_transform_8x8 )
573 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
576 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
577 static inline void x264_mb_cache_fenc_satd( x264_t *h )
579 ALIGNED_16( static uint8_t zero[16] ) = {0};
581 int x, y, satd_sum = 0, sa8d_sum = 0;
582 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
583 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
584 if( !h->mb.i_psy_rd )
586 for( y = 0; y < 4; y++ )
587 for( x = 0; x < 4; x++ )
589 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
590 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
591 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
592 satd_sum += h->mb.pic.fenc_satd[y][x];
594 for( y = 0; y < 2; y++ )
595 for( x = 0; x < 2; x++ )
597 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
598 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
599 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
600 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
602 h->mb.pic.fenc_satd_sum = satd_sum;
603 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
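    /* Why subtracting sad>>1 (or sad>>2 for 8x8) strips the DC term: the DC
     * coefficient of the Hadamard transform is just the pixel sum, i.e. the SAD
     * against the zero block, and satd/sa8d are normalized by 2 and 4
     * respectively, so what remains is the AC energy that psy-RD cares about. */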
606 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
608 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
610 if( a->i_satd_i8x8chroma < COST_MAX )
613 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
615 /* 8x8 prediction selection for chroma */
616 if( predict_mode[3] >= 0 && b_merged_satd )
618 int satdu[4], satdv[4];
619 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
620 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
621 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
622 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
623 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
624 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
626 for( ; *predict_mode >= 0; predict_mode++ )
628 int i_mode = *predict_mode;
629 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
631 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
632 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
637 for( ; *predict_mode >= 0; predict_mode++ )
640 int i_mode = *predict_mode;
642 /* we do the prediction */
643 if( h->mb.b_lossless )
644 x264_predict_lossless_8x8_chroma( h, i_mode );
647 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
648 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
651 /* we calculate the cost */
652 i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
653 h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
654 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
656 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
657 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
661 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
664 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
666 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
667 uint8_t *p_src = h->mb.pic.p_fenc[0];
668 uint8_t *p_dst = h->mb.pic.p_fdec[0];
671 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
673 /*---------------- Try all modes and calculate their scores --------------*/
675 /* 16x16 prediction selection */
676 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
678 if( b_merged_satd && predict_mode[3] >= 0 )
680 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
681 h->predict_16x16[I_PRED_16x16_P]( p_dst );
682 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
683 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
686 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
687 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
692 for( ; *predict_mode >= 0; predict_mode++ )
695 int i_mode = *predict_mode;
697 if( h->mb.b_lossless )
698 x264_predict_lossless_16x16( h, i_mode );
700 h->predict_16x16[i_mode]( p_dst );
702 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
703 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
704 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
705 a->i_satd_i16x16_dir[i_mode] = i_satd;
709 if( h->sh.i_type == SLICE_TYPE_B )
710 /* cavlc mb type prefix */
711 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
712 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
715 /* 8x8 prediction selection */
716 if( flags & X264_ANALYSE_I8x8 )
718 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
719 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
720 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
722 h->mb.i_cbp_luma = 0;
723 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
725 // FIXME some bias like in i4x4?
726 if( h->sh.i_type == SLICE_TYPE_B )
727 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
729 for( idx = 0;; idx++ )
733 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
734 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
735 int i_best = COST_MAX;
736 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
738 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
739 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
741 if( b_merged_satd && predict_mode[8] >= 0 )
744 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
745 satd[i_pred_mode] -= 3 * a->i_lambda;
746 for( i=2; i>=0; i-- )
748 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
749 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
754 for( ; *predict_mode >= 0; predict_mode++ )
757 int i_mode = *predict_mode;
759 if( h->mb.b_lossless )
760 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
762 h->predict_8x8[i_mode]( p_dst_by, edge );
764 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
765 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
766 i_satd -= a->i_lambda * 3;
768 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
769 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
773 if( idx == 3 || i_cost > i_satd_thresh )
776 /* we need to encode this block now (for the next ones) */
777 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
778 x264_mb_encode_i8x8( h, idx, a->i_qp );
780 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
785 a->i_satd_i8x8 = i_cost;
786 if( h->mb.i_skip_intra )
788 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
789 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
790 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
791 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
792 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
793 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
794 if( h->mb.i_skip_intra == 2 )
795 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
800 static const uint16_t cost_div_fix8[3] = {1024,512,341};
801 a->i_satd_i8x8 = COST_MAX;
802 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
804 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
808 /* 4x4 prediction selection */
809 if( flags & X264_ANALYSE_I4x4 )
812 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
813 h->mb.i_cbp_luma = 0;
814 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
816 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
818 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
819 if( h->sh.i_type == SLICE_TYPE_B )
820 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
822 for( idx = 0;; idx++ )
824 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
825 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
826 int i_best = COST_MAX;
827 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
829 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
831 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
832 /* emulate missing topright samples */
833 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
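                /* multiplying the byte by 0x01010101U broadcasts the last available
                 * top pixel into all four missing top-right samples in one 32-bit store */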
835 if( b_merged_satd && predict_mode[5] >= 0 )
838 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
839 satd[i_pred_mode] -= 3 * a->i_lambda;
840 for( i=2; i>=0; i-- )
841 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
845 for( ; *predict_mode >= 0; predict_mode++ )
848 int i_mode = *predict_mode;
850 if( h->mb.b_lossless )
851 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
853 h->predict_4x4[i_mode]( p_dst_by );
855 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
856 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
857 i_satd -= a->i_lambda * 3;
859 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
861 i_cost += i_best + 4 * a->i_lambda;
863 if( i_cost > i_satd_thresh || idx == 15 )
866 /* we need to encode this block now (for the next ones) */
867 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
868 x264_mb_encode_i4x4( h, idx, a->i_qp );
870 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
874 a->i_satd_i4x4 = i_cost;
875 if( h->mb.i_skip_intra )
877 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
878 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
879 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
880 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
881 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
882 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
883 if( h->mb.i_skip_intra == 2 )
884 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
888 a->i_satd_i4x4 = COST_MAX;
892 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
894 if( a->i_satd_i16x16 <= i_satd_thresh )
896 h->mb.i_type = I_16x16;
897 x264_analyse_update_cache( h, a );
898 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
901 a->i_satd_i16x16 = COST_MAX;
903 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
905 h->mb.i_type = I_4x4;
906 x264_analyse_update_cache( h, a );
907 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
910 a->i_satd_i4x4 = COST_MAX;
912 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
914 h->mb.i_type = I_8x8;
915 x264_analyse_update_cache( h, a );
916 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
917 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
920 a->i_satd_i8x8 = COST_MAX;
923 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
925 uint8_t *p_dst = h->mb.pic.p_fdec[0];
928 int i_mode, i_thresh;
929 uint64_t i_satd, i_best;
930 h->mb.i_skip_intra = 0;
932 if( h->mb.i_type == I_16x16 )
934 int old_pred_mode = a->i_predict16x16;
935 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
936 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
937 i_best = a->i_satd_i16x16;
938 for( ; *predict_mode >= 0; predict_mode++ )
940 int i_mode = *predict_mode;
941 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
943 h->mb.i_intra16x16_pred_mode = i_mode;
944 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
945 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
949 /* RD selection for chroma prediction */
950 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
951 if( predict_mode[1] >= 0 )
953 int8_t predict_mode_sorted[4];
955 i_thresh = a->i_satd_i8x8chroma * 5/4;
957 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
959 i_mode = *predict_mode;
960 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
961 predict_mode_sorted[i_max++] = i_mode;
966 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
967 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
968 /* the previous thing encoded was x264_intra_rd(), so the pixels and
969 * coefs for the current chroma mode are still around, so we only
970 * have to recount the bits. */
971 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
972 for( i = 0; i < i_max; i++ )
974 i_mode = predict_mode_sorted[i];
975 if( h->mb.b_lossless )
976 x264_predict_lossless_8x8_chroma( h, i_mode );
979 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
980 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
982 /* if we've already found a mode that needs no residual, then
983 * probably any mode with a residual will be worse,
984 * so avoid dct on the remaining modes to improve speed. */
985 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
986 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
988 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
989 h->mb.i_cbp_chroma = i_cbp_chroma_best;
993 if( h->mb.i_type == I_4x4 )
995 uint32_t pels[4] = {0}; // doesn't need initializing, just shuts up a gcc warning
997 for( idx = 0; idx < 16; idx++ )
999 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1000 i_best = COST_MAX64;
1002 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
1004 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1005 /* emulate missing topright samples */
1006 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1008 for( ; *predict_mode >= 0; predict_mode++ )
1010 i_mode = *predict_mode;
1011 if( h->mb.b_lossless )
1012 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1014 h->predict_4x4[i_mode]( p_dst_by );
1015 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1017 if( i_best > i_satd )
1019 a->i_predict4x4[idx] = i_mode;
1021 pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1022 pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1023 pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1024 pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1025 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1029 M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1030 M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1031 M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1032 M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1033 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1035 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1038 else if( h->mb.i_type == I_8x8 )
1040 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1041 for( idx = 0; idx < 4; idx++ )
1043 uint64_t pels_h = 0;
1045 uint16_t i_nnz[2] = {0}; //shut up gcc
1048 int cbp_luma_new = 0;
1049 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1051 i_best = COST_MAX64;
1055 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1056 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1057 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1059 for( ; *predict_mode >= 0; predict_mode++ )
1061 i_mode = *predict_mode;
1062 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1065 if( h->mb.b_lossless )
1066 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1068 h->predict_8x8[i_mode]( p_dst_by, edge );
1069 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1070 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1072 if( i_best > i_satd )
1074 a->i_predict8x8[idx] = i_mode;
1075 cbp_luma_new = h->mb.i_cbp_luma;
1078 pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1080 for( j=0; j<7; j++ )
1081 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1082 i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1083 i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1086 a->i_cbp_i8x8_luma = cbp_luma_new;
1087 M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1089 for( j=0; j<7; j++ )
1090 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1091 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1092 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1094 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
1099 #define LOAD_FENC( m, src, xoff, yoff) \
1100 (m)->p_cost_mv = a->p_cost_mv; \
1101 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1102 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1103 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1104 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1105 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
1107 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1108 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1109 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1110 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1111 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1112 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1113 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1114 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1115 (m)->weight = weight_none; \
1118 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1119 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1120 (m)->weight = h->sh.weight[ref];
1122 #define REF_COST(list, ref) \
1123 (a->p_cost_ref[list][ref])
1125 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1129 ALIGNED_4( int16_t mvc[8][2] );
1130 int i_halfpel_thresh = INT_MAX;
1131 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1133 /* 16x16 search over all ref frames */
1134 m.i_pixel = PIXEL_16x16;
1135 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1137 a->l0.me16x16.cost = INT_MAX;
1138 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1140 const int i_ref_cost = REF_COST( 0, i_ref );
1141 i_halfpel_thresh -= i_ref_cost;
1142 m.i_ref_cost = i_ref_cost;
1144 /* search with ref */
1145 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1146 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1148 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1149 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1151 if( h->mb.ref_blind_dupe == i_ref )
1153 CP32( m.mv, a->l0.mvc[0][0] );
1154 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1157 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1159 /* early termination
1160 * SSD threshold would probably be better than SATD */
1163 && m.cost-m.cost_mv < 300*a->i_lambda
1164 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1165 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1166 && x264_macroblock_probe_pskip( h ) )
1168 h->mb.i_type = P_SKIP;
1169 x264_analyse_update_cache( h, a );
1170 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
1174 m.cost += i_ref_cost;
1175 i_halfpel_thresh += i_ref_cost;
1177 if( m.cost < a->l0.me16x16.cost )
1178 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1180 /* save mv for predicting neighbors */
1181 CP32( a->l0.mvc[i_ref][0], m.mv );
1182 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1185 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1186 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
1188 h->mb.i_type = P_L0;
1191 x264_mb_cache_fenc_satd( h );
1192 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1194 h->mb.i_partition = D_16x16;
1195 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1196 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1197 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1198 h->mb.i_type = P_SKIP;
1203 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1207 uint8_t **p_fenc = h->mb.pic.p_fenc;
1208 int i_halfpel_thresh = INT_MAX;
1209 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1211 int i_maxref = h->mb.pic.i_fref[0]-1;
1213 h->mb.i_partition = D_8x8;
1215 #define CHECK_NEIGHBOUR(i)\
1217 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1218 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1222 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1223 * than those used by the neighbors */
1224 if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1225 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1228 CHECK_NEIGHBOUR( -8 - 1 );
1229 CHECK_NEIGHBOUR( -8 + 0 );
1230 CHECK_NEIGHBOUR( -8 + 2 );
1231 CHECK_NEIGHBOUR( -8 + 4 );
1232 CHECK_NEIGHBOUR( 0 - 1 );
1233 CHECK_NEIGHBOUR( 2*8 - 1 );
1236 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1237 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1239 for( i = 0; i < 4; i++ )
1241 x264_me_t *l0m = &a->l0.me8x8[i];
1245 m.i_pixel = PIXEL_8x8;
1247 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1248 l0m->cost = INT_MAX;
1249 for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1251 const int i_ref_cost = REF_COST( 0, i_ref );
1252 m.i_ref_cost = i_ref_cost;
1254 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1255 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1257 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1258 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1259 if( h->mb.ref_blind_dupe == i_ref )
1261 CP32( m.mv, a->l0.mvc[0][i+1] );
1262 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1265 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1267 m.cost += i_ref_cost;
1268 i_halfpel_thresh += i_ref_cost;
1269 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1271 if( m.cost < l0m->cost )
1272 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1273 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1274 i_ref = h->mb.ref_blind_dupe;
1278 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1279 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1281 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1282 are effectively zero. */
1283 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1284 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1287 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1288 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1289 /* P_8x8 ref0 has no ref cost */
1290 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1291 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1292 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1293 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1294 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1297 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1299 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1300 * reference frame flags. Thus, if we're not doing mixedrefs, just
1301 * don't bother analysing the dupes. */
1302 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1303 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1304 uint8_t **p_fenc = h->mb.pic.p_fenc;
1306 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1309 /* XXX Needed for x264_mb_predict_mv */
1310 h->mb.i_partition = D_8x8;
1313 CP32( mvc[0], a->l0.me16x16.mv );
1315 for( i = 0; i < 4; i++ )
1317 x264_me_t *m = &a->l0.me8x8[i];
1321 m->i_pixel = PIXEL_8x8;
1322 m->i_ref_cost = i_ref_cost;
1324 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1325 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1326 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1328 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1329 x264_me_search( h, m, mvc, i_mvc );
1331 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1333 CP32( mvc[i_mvc], m->mv );
1337 m->cost += i_ref_cost;
1338 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1339 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1342 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1343 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1344 /* theoretically this should include 4*ref_cost,
1345 * but 3 seems a better approximation of cabac. */
1346 if( h->param.b_cabac )
1347 a->l0.i_cost8x8 -= i_ref_cost;
1348 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1349 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
1352 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1355 uint8_t **p_fenc = h->mb.pic.p_fenc;
1356 ALIGNED_4( int16_t mvc[3][2] );
1359 /* XXX Needed for x264_mb_predict_mv */
1360 h->mb.i_partition = D_16x8;
1362 for( i = 0; i < 2; i++ )
1364 x264_me_t *l0m = &a->l0.me16x8[i];
1365 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1366 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1367 const int ref8[2] = { minref, maxref };
1368 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1370 m.i_pixel = PIXEL_16x8;
1372 LOAD_FENC( &m, p_fenc, 0, 8*i );
1373 l0m->cost = INT_MAX;
1374 for( j = 0; j < i_ref8s; j++ )
1376 const int i_ref = ref8[j];
1377 const int i_ref_cost = REF_COST( 0, i_ref );
1378 m.i_ref_cost = i_ref_cost;
1380 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1381 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1382 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1383 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1385 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1386 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1388 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1389 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1390 /* We can only take this shortcut if the first search was performed on ref0. */
1391 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1393 /* We can just leave the MV from the previous ref search. */
1394 x264_me_refine_qpel_refdupe( h, &m, NULL );
1397 x264_me_search( h, &m, mvc, 3 );
1399 m.cost += i_ref_cost;
1401 if( m.cost < l0m->cost )
1402 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1404 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1405 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1408 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
1411 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1414 uint8_t **p_fenc = h->mb.pic.p_fenc;
1415 ALIGNED_4( int16_t mvc[3][2] );
1418 /* XXX Needed for x264_mb_predict_mv */
1419 h->mb.i_partition = D_8x16;
1421 for( i = 0; i < 2; i++ )
1423 x264_me_t *l0m = &a->l0.me8x16[i];
1424 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1425 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1426 const int ref8[2] = { minref, maxref };
1427 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1429 m.i_pixel = PIXEL_8x16;
1431 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1432 l0m->cost = INT_MAX;
1433 for( j = 0; j < i_ref8s; j++ )
1435 const int i_ref = ref8[j];
1436 const int i_ref_cost = REF_COST( 0, i_ref );
1437 m.i_ref_cost = i_ref_cost;
1439 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1440 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1441 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1443 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1444 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1446 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1447 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1448 /* We can only take this shortcut if the first search was performed on ref0. */
1449 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1451 /* We can just leave the MV from the previous ref search. */
1452 x264_me_refine_qpel_refdupe( h, &m, NULL );
1455 x264_me_search( h, &m, mvc, 3 );
1457 m.cost += i_ref_cost;
1459 if( m.cost < l0m->cost )
1460 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1462 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1463 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1466 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1469 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1471 ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1472 uint8_t *pix2 = pix1+8;
1473 const int i_stride = h->mb.pic.i_stride[1];
1474 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1475 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1476 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1477 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1478 x264_weight_t *weight = h->sh.weight[i_ref];
1480 #define CHROMA4x4MC( width, height, me, x, y ) \
1481 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1482 if( weight[1].weightfn ) \
1483 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1484 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1485 if( weight[2].weightfn ) \
1486 weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
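/* Buffer layout note for the macro above: the U samples are written at pix1 and
 * the V samples at pix2 = pix1+8 within the same 16-wide rows, so one scratch
 * buffer covers both planes of the 4x4 chroma block that corresponds to one
 * 8x8 luma partition. */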
1489 if( pixel == PIXEL_4x4 )
1491 x264_me_t *m = a->l0.me4x4[i8x8];
1492 CHROMA4x4MC( 2,2, m[0], 0,0 );
1493 CHROMA4x4MC( 2,2, m[1], 2,0 );
1494 CHROMA4x4MC( 2,2, m[2], 0,2 );
1495 CHROMA4x4MC( 2,2, m[3], 2,2 );
1497 else if( pixel == PIXEL_8x4 )
1499 x264_me_t *m = a->l0.me8x4[i8x8];
1500 CHROMA4x4MC( 4,2, m[0], 0,0 );
1501 CHROMA4x4MC( 4,2, m[1], 0,2 );
1505 x264_me_t *m = a->l0.me4x8[i8x8];
1506 CHROMA4x4MC( 2,4, m[0], 0,0 );
1507 CHROMA4x4MC( 2,4, m[1], 2,0 );
1510 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1511 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1514 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1516 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1517 uint8_t **p_fenc = h->mb.pic.p_fenc;
1518 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1521 /* XXX Needed for x264_mb_predict_mv */
1522 h->mb.i_partition = D_8x8;
1524 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1526 const int idx = 4*i8x8 + i4x4;
1527 const int x4 = block_idx_x[idx];
1528 const int y4 = block_idx_y[idx];
1529 const int i_mvc = (i4x4 == 0);
1531 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1533 m->i_pixel = PIXEL_4x4;
1535 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1536 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1537 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1539 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1540 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1542 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1544 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1545 a->l0.me4x4[i8x8][1].cost +
1546 a->l0.me4x4[i8x8][2].cost +
1547 a->l0.me4x4[i8x8][3].cost +
1548 REF_COST( 0, i_ref ) +
1549 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1550 if( h->mb.b_chroma_me )
1551 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1554 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1556 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1557 uint8_t **p_fenc = h->mb.pic.p_fenc;
1558 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1561 /* XXX Needed for x264_mb_predict_mv */
1562 h->mb.i_partition = D_8x8;
1564 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1566 const int idx = 4*i8x8 + 2*i8x4;
1567 const int x4 = block_idx_x[idx];
1568 const int y4 = block_idx_y[idx];
1569 const int i_mvc = (i8x4 == 0);
1571 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1573 m->i_pixel = PIXEL_8x4;
1575 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1576 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1577 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1579 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1580 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1582 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1584 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1585 REF_COST( 0, i_ref ) +
1586 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1587 if( h->mb.b_chroma_me )
1588 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1591 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1593 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1594 uint8_t **p_fenc = h->mb.pic.p_fenc;
1595 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1598 /* XXX Needed for x264_mb_predict_mv */
1599 h->mb.i_partition = D_8x8;
1601 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1603 const int idx = 4*i8x8 + i4x8;
1604 const int x4 = block_idx_x[idx];
1605 const int y4 = block_idx_y[idx];
1606 const int i_mvc = (i4x8 == 0);
1608 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1610 m->i_pixel = PIXEL_4x8;
1612 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1613 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1614 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1616 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1617 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1619 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1621 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1622 REF_COST( 0, i_ref ) +
1623 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1624 if( h->mb.b_chroma_me )
1625 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1628 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1630 /* Assumes that fdec still contains the results of
1631 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1633 uint8_t **p_fenc = h->mb.pic.p_fenc;
1634 uint8_t **p_fdec = h->mb.pic.p_fdec;
1637 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1638 for( i = 0; i < 4; i++ )
1640 const int x = (i&1)*8;
1641 const int y = (i>>1)*8;
1642 a->i_cost16x16direct +=
1643 a->i_cost8x8direct[i] =
1644 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1647 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1651 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1653 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1654 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1655 uint8_t *src0, *src1;
1656 int stride0 = 16, stride1 = 16;
1660 ALIGNED_4( int16_t mvc[9][2] );
1661 int i_halfpel_thresh = INT_MAX;
1662 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1664 /* 16x16 search over all ref frames */
1665 m.i_pixel = PIXEL_16x16;
1666 m.weight = weight_none;
1668 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1671 a->l0.me16x16.cost = INT_MAX;
1672 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1674 const int i_ref_cost = REF_COST( 0, i_ref );
1675 m.i_ref_cost = i_ref_cost;
1676 /* search with ref */
1677 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1678 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1679 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1680 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1683 m.cost += i_ref_cost;
1685 if( m.cost < a->l0.me16x16.cost )
1687 a->l0.i_ref = i_ref;
1688 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1691 /* save mv for predicting neighbors */
1692 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1694 a->l0.me16x16.i_ref = a->l0.i_ref;
1697 i_halfpel_thresh = INT_MAX;
1698 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1699 a->l1.me16x16.cost = INT_MAX;
1700 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1702 const int i_ref_cost = REF_COST( 1, i_ref );
1703 m.i_ref_cost = i_ref_cost;
1704 /* search with ref */
1705 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1706 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1707 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1708 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1711 m.cost += i_ref_cost;
1713 if( m.cost < a->l1.me16x16.cost )
1715 a->l1.i_ref = i_ref;
1716 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1719 /* save mv for predicting neighbors */
1720 CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1722 a->l1.me16x16.i_ref = a->l1.i_ref;
1724 /* get cost of BI mode */
1725 src0 = h->mc.get_ref( pix0, &stride0,
1726 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1727 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16, weight_none );
1728 src1 = h->mc.get_ref( pix1, &stride1,
1729 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1730 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16, weight_none );
1732 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1734 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1735 + REF_COST( 0, a->l0.i_ref )
1736 + REF_COST( 1, a->l1.i_ref )
1737 + a->l0.me16x16.cost_mv
1738 + a->l1.me16x16.cost_mv;
1741 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1742 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1743 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1746 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1748 const int x = 2*(i%2);
1749 const int y = 2*(i/2);
1751 switch( h->mb.i_sub_partition[i] )
1754 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1757 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1758 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1761 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1762 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1765 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1766 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1767 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1768 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1771 x264_log( h, X264_LOG_ERROR, "internal error\n" );
1776 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1777 if( x264_mb_partition_listX_table[0][part] ) \
1779 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1780 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1784 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1785 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1787 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1789 if( x264_mb_partition_listX_table[1][part] ) \
1791 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1792 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1796 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1797 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1799 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
1802 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1806 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1808 x264_mb_load_mv_direct8x8( h, i );
1811 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1812 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1813 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1818 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
1821 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1823 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
1825 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1827 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
1831 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1833 uint8_t **p_fref[2] =
1834 { h->mb.pic.p_fref[0][a->l0.i_ref],
1835 h->mb.pic.p_fref[1][a->l1.i_ref] };
1836 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1839 /* XXX Needed for x264_mb_predict_mv */
1840 h->mb.i_partition = D_8x8;
1844 for( i = 0; i < 4; i++ )
1849 int i_part_cost_bi = 0;
1850 int stride[2] = {8,8};
1853 for( l = 0; l < 2; l++ )
1855 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1856 const int i_ref_cost = REF_COST( l, lX->i_ref );
1857 x264_me_t *m = &lX->me8x8[i];
1859 m->i_pixel = PIXEL_8x8;
1860 m->i_ref_cost = i_ref_cost;
1862 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1863 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1865 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
1866 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1867 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1868 m->cost += i_ref_cost;
1870 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
1873 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1874 m->mv[0], m->mv[1], 8, 8, weight_none );
1875 i_part_cost_bi += m->cost_mv + i_ref_cost;
1877 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1878 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1879 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1880 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1881 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1883 i_part_cost = a->l0.me8x8[i].cost;
1884 h->mb.i_sub_partition[i] = D_L0_8x8;
1885 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1886 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1887 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1888 a->i_cost8x8bi += i_part_cost;
1890 /* XXX Needed for x264_mb_predict_mv */
1891 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1895 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
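/* Each 8x8 block above independently picks the cheapest of L0, L1, BI and (if available)
 * DIRECT via the COPY2_IF_LT chain; the per-block winners are summed into i_cost8x8bi,
 * and each block's decision is cached immediately so that MV prediction for the
 * following blocks uses the choices already made. */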
1898 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1900 uint8_t **p_fref[2] =
1901 { h->mb.pic.p_fref[0][a->l0.i_ref],
1902 h->mb.pic.p_fref[1][a->l1.i_ref] };
1903 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1904 ALIGNED_4( int16_t mvc[2][2] );
1907 h->mb.i_partition = D_16x8;
1908 a->i_cost16x8bi = 0;
1910 for( i = 0; i < 2; i++ )
1913 int i_part_cost_bi = 0;
1914 int stride[2] = {16,16};
1917 /* TODO: check only the list(s) that were used in b8x8? */
1918 for( l = 0; l < 2; l++ )
1920 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1921 const int i_ref_cost = REF_COST( l, lX->i_ref );
1922 x264_me_t *m = &lX->me16x8[i];
1924 m->i_pixel = PIXEL_16x8;
1925 m->i_ref_cost = i_ref_cost;
1927 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1928 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1930 CP32( mvc[0], lX->me8x8[2*i].mv );
1931 CP32( mvc[1], lX->me8x8[2*i+1].mv );
1933 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
1934 x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1935 x264_me_search( h, m, mvc, 2 );
1936 m->cost += i_ref_cost;
1939 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1940 m->mv[0], m->mv[1], 16, 8, weight_none );
1941 i_part_cost_bi += m->cost_mv + i_ref_cost;
1943 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1944 i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1946 i_part_cost = a->l0.me16x8[i].cost;
1947 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1948 if( a->l1.me16x8[i].cost < i_part_cost )
1950 i_part_cost = a->l1.me16x8[i].cost;
1951 a->i_mb_partition16x8[i] = D_L1_8x8;
1953 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1955 i_part_cost = i_part_cost_bi;
1956 a->i_mb_partition16x8[i] = D_BI_8x8;
1958 a->i_cost16x8bi += i_part_cost;
1960 x264_mb_cache_mv_b16x8( h, a, i, 0 );
1964 a->i_mb_type16x8 = B_L0_L0
1965 + (a->i_mb_partition16x8[0]>>2) * 3
1966 + (a->i_mb_partition16x8[1]>>2);
1967 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
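/* The two per-half decisions (L0/L1/BI) are folded into a single B 16x8 macroblock type:
 * the sub-partition enum is laid out so that (D_Lx_8x8 >> 2) yields 0, 1 or 2 for L0, L1
 * and BI respectively, which indexes the 3x3 grid of types starting at B_L0_L0. */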
1970 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1972 uint8_t **p_fref[2] =
1973 { h->mb.pic.p_fref[0][a->l0.i_ref],
1974 h->mb.pic.p_fref[1][a->l1.i_ref] };
1975 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
1976 ALIGNED_4( int16_t mvc[2][2] );
1979 h->mb.i_partition = D_8x16;
1980 a->i_cost8x16bi = 0;
1982 for( i = 0; i < 2; i++ )
1985 int i_part_cost_bi = 0;
1986 int stride[2] = {8,8};
1989 for( l = 0; l < 2; l++ )
1991 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1992 const int i_ref_cost = REF_COST( l, lX->i_ref );
1993 x264_me_t *m = &lX->me8x16[i];
1995 m->i_pixel = PIXEL_8x16;
1996 m->i_ref_cost = i_ref_cost;
1998 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1999 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
2001 CP32( mvc[0], lX->me8x8[i].mv );
2002 CP32( mvc[1], lX->me8x8[i+2].mv );
2004 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
2005 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2006 x264_me_search( h, m, mvc, 2 );
2007 m->cost += i_ref_cost;
2010 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2011 m->mv[0], m->mv[1], 8, 16, weight_none );
2012 i_part_cost_bi += m->cost_mv + i_ref_cost;
2015 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2016 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2018 i_part_cost = a->l0.me8x16[i].cost;
2019 a->i_mb_partition8x16[i] = D_L0_8x8;
2020 if( a->l1.me8x16[i].cost < i_part_cost )
2022 i_part_cost = a->l1.me8x16[i].cost;
2023 a->i_mb_partition8x16[i] = D_L1_8x8;
2025 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2027 i_part_cost = i_part_cost_bi;
2028 a->i_mb_partition8x16[i] = D_BI_8x8;
2030 a->i_cost8x16bi += i_part_cost;
2032 x264_mb_cache_mv_b8x16( h, a, i, 0 );
2036 a->i_mb_type8x16 = B_L0_L0
2037 + (a->i_mb_partition8x16[0]>>2) * 3
2038 + (a->i_mb_partition8x16[1]>>2);
2039 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2042 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2044 int thresh = i_satd * 5/4;
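/* Strategy: only partition sizes whose SATD estimate came within 25% of the best inter
 * SATD are re-scored with a real RD cost (x264_rd_cost_mb effectively does a trial encode
 * of the whole MB); anything above the threshold has its cost forced to COST_MAX so it
 * can no longer be selected. */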
2046 h->mb.i_type = P_L0;
2047 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2049 h->mb.i_partition = D_16x16;
2050 x264_analyse_update_cache( h, a );
2051 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2054 if( a->l0.i_cost16x8 <= thresh )
2056 h->mb.i_partition = D_16x8;
2057 x264_analyse_update_cache( h, a );
2058 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2061 a->l0.i_cost16x8 = COST_MAX;
2063 if( a->l0.i_cost8x16 <= thresh )
2065 h->mb.i_partition = D_8x16;
2066 x264_analyse_update_cache( h, a );
2067 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2070 a->l0.i_cost8x16 = COST_MAX;
2072 if( a->l0.i_cost8x8 <= thresh )
2074 h->mb.i_type = P_8x8;
2075 h->mb.i_partition = D_8x8;
2076 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2079 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2080 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2081 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2082 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2083 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2084 * for future blocks are those left over from previous RDO calls. */
2085 for( i = 0; i < 4; i++ )
2087 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2088 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2089 int subtype, btype = D_L0_8x8;
2090 uint64_t bcost = COST_MAX64;
2091 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2094 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2096 h->mb.i_sub_partition[i] = subtype;
2097 x264_mb_cache_mv_p8x8( h, a, i );
2098 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2099 COPY2_IF_LT( bcost, cost, btype, subtype );
2101 if( h->mb.i_sub_partition[i] != btype )
2103 h->mb.i_sub_partition[i] = btype;
2104 x264_mb_cache_mv_p8x8( h, a, i );
2109 x264_analyse_update_cache( h, a );
2110 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2113 a->l0.i_cost8x8 = COST_MAX;
2116 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2118 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
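/* Same idea as the P-frame RD gate: only modes within ~6% of the best inter SATD get a
 * full RD evaluation. The margin is widened slightly (18/16 instead of 17/16) when psy-RD
 * is on, presumably because SATD tracks the psy-RD score less closely. */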
2120 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2122 h->mb.i_type = B_DIRECT;
2123 /* Assumes direct/skip MC is still in fdec */
2124 /* Requires b-rdo to be done before intra analysis */
2125 h->mb.b_skip_mc = 1;
2126 x264_analyse_update_cache( h, a );
2127 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2128 h->mb.b_skip_mc = 0;
2131 //FIXME not all the update_cache calls are needed
2132 h->mb.i_partition = D_16x16;
2134 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2136 h->mb.i_type = B_L0_L0;
2137 x264_analyse_update_cache( h, a );
2138 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2142 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2144 h->mb.i_type = B_L1_L1;
2145 x264_analyse_update_cache( h, a );
2146 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2150 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2152 h->mb.i_type = B_BI_BI;
2153 x264_analyse_update_cache( h, a );
2154 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2158 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2160 h->mb.i_type = B_8x8;
2161 h->mb.i_partition = D_8x8;
2162 x264_analyse_update_cache( h, a );
2163 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2164 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2168 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2170 h->mb.i_type = a->i_mb_type16x8;
2171 h->mb.i_partition = D_16x8;
2172 x264_analyse_update_cache( h, a );
2173 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2177 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2179 h->mb.i_type = a->i_mb_type8x16;
2180 h->mb.i_partition = D_8x16;
2181 x264_analyse_update_cache( h, a );
2182 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2186 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2188 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2191 if( IS_INTRA(h->mb.i_type) )
2194 switch( h->mb.i_partition )
2197 if( h->mb.i_type == B_BI_BI )
2198 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
2201 for( i=0; i<2; i++ )
2202 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2203 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2206 for( i=0; i<2; i++ )
2207 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2208 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2211 for( i=0; i<4; i++ )
2212 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2213 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2218 static inline void x264_mb_analyse_transform( x264_t *h )
2220 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2222 int i_cost4, i_cost8;
2223 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2226 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2227 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2228 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2229 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2231 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2232 h->mb.b_skip_mc = 1;
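/* The transform size is decided purely from analysis metrics: sa8d (8x8 Hadamard)
 * stands in for the residual cost under the 8x8 transform and satd (4x4 Hadamard) for
 * the 4x4 transform, so no actual encode is needed here. b_skip_mc records that the MC
 * result already in fdec is still valid and can be reused by macroblock_encode. */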
2236 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2238 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2241 x264_analyse_update_cache( h, a );
2242 h->mb.b_transform_8x8 ^= 1;
2243 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2244 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2246 if( *i_rd >= i_rd8 )
2249 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2253 h->mb.b_transform_8x8 ^= 1;
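/* If flipping the transform size lowered the RD cost, the flip is kept and the SATD
 * score is rescaled in proportion (satd * rd8 / rd), presumably so that later
 * SATD-vs-threshold comparisons stay consistent with the new RD cost; otherwise the
 * flip is undone. */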
2257 /* Rate-distortion optimal QP selection.
2258 * FIXME: More than half of the benefit of this function seems to be
2259 * in the way it improves the coding of chroma DC (by decimating or
2260 * finding a better way to code a single DC coefficient.)
2261 * There must be a more efficient way to get that portion of the benefit
2262 * without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
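/* Outline of the search: starting from the rate-controlled QP, the MB is fully
 * re-encoded (x264_rd_cost_mb) at successive QPs in each allowed direction, keeping the
 * best score; a direction is abandoned after a small number of non-improving steps (see
 * the failure thresholds below). The previous MB's QP is always tried as well, since a
 * zero qp_delta is the cheapest to code. */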
2264 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2266 int bcost, cost, direction, failures, prevcost, origcost;
2267 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2268 int last_qp_tried = 0;
2269 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2271 /* If CBP is already zero, don't raise the quantizer any higher. */
2272 for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2274 /* Without psy-RD, require monotonicity when moving quant away from previous
2275 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2276 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2277 * allow 2 failures when moving quant towards previous quant.
2278 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2279 int threshold = (!!h->mb.i_psy_rd);
2280 /* Raise the threshold for failures if we're moving towards the last QP. */
2281 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2282 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2284 h->mb.i_qp = orig_qp;
2286 prevcost = origcost;
2287 h->mb.i_qp += direction;
2288 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2290 if( h->mb.i_last_qp == h->mb.i_qp )
2292 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2293 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2294 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2296 /* We can't assume that the costs are monotonic over QPs.
2297 * Tie case-as-failure seems to give better results. */
2298 if( cost < prevcost )
2304 if( failures > threshold )
2306 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2308 h->mb.i_qp += direction;
2312 /* Always try the last block's QP. */
2313 if( !last_qp_tried )
2315 h->mb.i_qp = h->mb.i_last_qp;
2316 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2317 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2318 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2322 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2324 /* Check transform again; decision from before may no longer be optimal. */
2325 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2326 x264_mb_transform_8x8_allowed( h ) )
2328 h->mb.b_transform_8x8 ^= 1;
2329 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2331 h->mb.b_transform_8x8 ^= 1;
2335 /*****************************************************************************
2336 * x264_macroblock_analyse:
2337 *****************************************************************************/
2338 void x264_macroblock_analyse( x264_t *h )
2340 x264_mb_analysis_t analysis;
2341 int i_cost = COST_MAX;
2344 h->mb.i_qp = x264_ratecontrol_qp( h );
2345 if( h->param.rc.i_aq_mode )
2347 x264_adaptive_quant( h );
2348 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2349 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2350 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2351 h->mb.i_qp = h->mb.i_last_qp;
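/* e.g. if the previous MB was coded at QP 26 and AQ asks for 27, coding 26 again
 * typically costs less overall than spending qp_delta bits on a one-step change. */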
2354 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2356 /*--------------------------- Do the analysis ---------------------------*/
2357 if( h->sh.i_type == SLICE_TYPE_I )
2360 if( analysis.i_mbrd )
2361 x264_mb_cache_fenc_satd( h );
2362 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2363 if( analysis.i_mbrd )
2364 x264_intra_rd( h, &analysis, COST_MAX );
2366 i_cost = analysis.i_satd_i16x16;
2367 h->mb.i_type = I_16x16;
2368 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2369 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2370 if( analysis.i_satd_pcm < i_cost )
2371 h->mb.i_type = I_PCM;
2373 else if( analysis.i_mbrd >= 2 )
2374 x264_intra_rd_refine( h, &analysis );
2376 else if( h->sh.i_type == SLICE_TYPE_P )
2380 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2382 analysis.b_try_pskip = 0;
2383 if( analysis.b_force_intra )
2385 if( !h->param.analyse.b_psy )
2387 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2388 goto intra_analysis;
2393 /* Fast P_SKIP detection */
2394 if( h->param.analyse.b_fast_pskip )
2396 if( h->param.i_threads > 1 && !h->param.b_sliced_threads && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2397 // FIXME don't need to check this if the reference frame is done
2399 else if( h->param.analyse.i_subpel_refine >= 3 )
2400 analysis.b_try_pskip = 1;
2401 else if( h->mb.i_mb_type_left == P_SKIP ||
2402 h->mb.i_mb_type_top == P_SKIP ||
2403 h->mb.i_mb_type_topleft == P_SKIP ||
2404 h->mb.i_mb_type_topright == P_SKIP )
2405 b_skip = x264_macroblock_probe_pskip( h );
2409 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2413 h->mb.i_type = P_SKIP;
2414 h->mb.i_partition = D_16x16;
2415 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 || h->param.b_sliced_threads );
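/* A successful skip probe ends the analysis here: the MB is coded as P_SKIP with the
 * predicted skip MV and none of the inter/intra searches below are run. The assert
 * re-checks the frame-threading constraint: with frame-parallel threads the skip MV
 * must not point below the reference rows that have already been reconstructed. */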
2419 const unsigned int flags = h->param.analyse.inter;
2423 int i_satd_inter, i_satd_intra;
2425 x264_mb_analyse_load_costs( h, &analysis );
2427 x264_mb_analyse_inter_p16x16( h, &analysis );
2429 if( h->mb.i_type == P_SKIP )
2432 if( flags & X264_ANALYSE_PSUB16x16 )
2434 if( h->param.analyse.b_mixed_references )
2435 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2437 x264_mb_analyse_inter_p8x8( h, &analysis );
2440 /* Select best inter mode */
2442 i_partition = D_16x16;
2443 i_cost = analysis.l0.me16x16.cost;
2445 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2446 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2449 i_partition = D_8x8;
2450 i_cost = analysis.l0.i_cost8x8;
2453 if( flags & X264_ANALYSE_PSUB8x8 )
2455 for( i = 0; i < 4; i++ )
2457 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2458 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2460 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2461 h->mb.i_sub_partition[i] = D_L0_4x4;
2463 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2464 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2465 h->mb.i_sub_partition[i], D_L0_8x4 );
2467 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2468 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2469 h->mb.i_sub_partition[i], D_L0_4x8 );
2471 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2473 x264_mb_cache_mv_p8x8( h, &analysis, i );
2475 analysis.l0.i_cost8x8 = i_cost;
2479 /* Now do 16x8/8x16 */
2480 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
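/* Heuristic gate for the rectangular partitions: 16x8/8x16 are only searched if the 8x8
 * cost came within roughly the MV bits of two of the 8x8 blocks of the 16x16 cost, i.e.
 * about what a rectangular split could save by coding two MVs instead of four. */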
2481 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2482 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2484 x264_mb_analyse_inter_p16x8( h, &analysis );
2485 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2487 x264_mb_analyse_inter_p8x16( h, &analysis );
2488 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2491 h->mb.i_partition = i_partition;
2494 //FIXME mb_type costs?
2495 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2499 else if( i_partition == D_16x16 )
2501 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2502 i_cost = analysis.l0.me16x16.cost;
2504 else if( i_partition == D_16x8 )
2506 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2507 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2508 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2510 else if( i_partition == D_8x16 )
2512 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2513 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2514 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2516 else if( i_partition == D_8x8 )
2520 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2522 switch( h->mb.i_sub_partition[i8x8] )
2525 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2526 i_cost += analysis.l0.me8x8[i8x8].cost;
2529 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2530 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2531 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2532 analysis.l0.me8x4[i8x8][1].cost;
2535 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2536 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2537 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2538 analysis.l0.me4x8[i8x8][1].cost;
2542 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2543 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2544 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2545 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2546 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2547 analysis.l0.me4x4[i8x8][1].cost +
2548 analysis.l0.me4x4[i8x8][2].cost +
2549 analysis.l0.me4x4[i8x8][3].cost;
2552 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2558 if( h->mb.b_chroma_me )
2560 x264_mb_analyse_intra_chroma( h, &analysis );
2561 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2562 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2563 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2564 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2567 x264_mb_analyse_intra( h, &analysis, i_cost );
2569 i_satd_inter = i_cost;
2570 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2571 analysis.i_satd_i8x8,
2572 analysis.i_satd_i4x4 );
2574 if( analysis.i_mbrd )
2576 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2578 i_partition = D_16x16;
2579 i_cost = analysis.l0.i_rd16x16;
2580 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2581 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2582 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2583 h->mb.i_type = i_type;
2584 h->mb.i_partition = i_partition;
2585 if( i_cost < COST_MAX )
2586 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2587 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2590 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2591 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2592 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2593 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2595 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2597 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2598 * it were an inter block. */
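/* The re-analysis below also runs at a QP lowered by ip_offset, presumably so the forced
 * intra block can reconstruct close enough to the inter prediction it is masking. */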
2599 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2600 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2601 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2602 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2603 goto intra_analysis;
2606 h->mb.i_type = i_type;
2608 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2610 if( IS_INTRA( h->mb.i_type ) )
2612 x264_intra_rd_refine( h, &analysis );
2614 else if( i_partition == D_16x16 )
2616 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2617 analysis.l0.me16x16.cost = i_cost;
2618 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2620 else if( i_partition == D_16x8 )
2622 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2623 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2624 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2625 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2626 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2627 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2629 else if( i_partition == D_8x16 )
2631 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2632 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2633 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2634 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2635 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2636 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2638 else if( i_partition == D_8x8 )
2641 x264_analyse_update_cache( h, &analysis );
2642 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2644 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2646 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2648 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2650 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2651 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2653 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2655 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2656 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2658 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2660 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2661 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2662 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2663 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2670 else if( h->sh.i_type == SLICE_TYPE_B )
2672 int i_bskip_cost = COST_MAX;
2675 if( analysis.i_mbrd )
2676 x264_mb_cache_fenc_satd( h );
2678 h->mb.i_type = B_SKIP;
2679 if( h->mb.b_direct_auto_write )
2681 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2682 for( i = 0; i < 2; i++ )
2685 h->sh.b_direct_spatial_mv_pred ^= 1;
2686 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2687 if( analysis.b_direct_available )
2692 b_skip = x264_macroblock_probe_bskip( h );
2694 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
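/* direct=auto: both spatial and temporal direct prediction are probed for this MB by
 * toggling b_direct_spatial_mv_pred, and whichever variant lets the MB be skipped scores
 * a point in i_direct_score; the accumulated totals drive the slice-level direct-mode
 * choice for later frames. */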
2701 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2703 if( analysis.b_direct_available )
2705 if( !h->mb.b_direct_auto_write )
2707 if( analysis.i_mbrd )
2709 i_bskip_cost = ssd_mb( h );
2710 /* 6 = minimum cavlc cost of a non-skipped MB */
2711 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
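/* i_lambda2 carries an 8-bit fractional scale, so (6 * i_lambda2 + 128) >> 8 converts
 * the 6-bit minimum cost of a non-skipped MB into the same SSD units returned by
 * ssd_mb(), with +128 for rounding; B_SKIP is taken if the direct prediction's SSD is
 * no larger than that. */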
2713 else if( !h->mb.b_direct_auto_write )
2715 /* Conditioning the probe on neighboring block types
2716 * doesn't seem to help speed or quality. */
2717 b_skip = x264_macroblock_probe_bskip( h );
2723 const unsigned int flags = h->param.analyse.inter;
2727 h->mb.b_skip_mc = 0;
2729 x264_mb_analyse_load_costs( h, &analysis );
2731 /* select best inter mode */
2732 /* direct must be first */
2733 if( analysis.b_direct_available )
2734 x264_mb_analyse_inter_direct( h, &analysis );
2736 x264_mb_analyse_inter_b16x16( h, &analysis );
2739 i_partition = D_16x16;
2740 i_cost = analysis.l0.me16x16.cost;
2741 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2742 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2743 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2745 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2747 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2748 if( i_bskip_cost < analysis.i_rd16x16direct &&
2749 i_bskip_cost < analysis.i_rd16x16bi &&
2750 i_bskip_cost < analysis.l0.i_rd16x16 &&
2751 i_bskip_cost < analysis.l1.i_rd16x16 )
2753 h->mb.i_type = B_SKIP;
2754 x264_analyse_update_cache( h, &analysis );
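/* Early-out: when direct is within ~3% of the best 16x16 SATD, the 16x16 candidates are
 * RD-scored immediately; if the plain skip reconstruction beats all of them in RD, the
 * MB is committed as B_SKIP and the rest of the B analysis is bypassed. */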
2759 if( flags & X264_ANALYSE_BSUB16x16 )
2761 x264_mb_analyse_inter_b8x8( h, &analysis );
2762 if( analysis.i_cost8x8bi < i_cost )
2765 i_partition = D_8x8;
2766 i_cost = analysis.i_cost8x8bi;
2768 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2769 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2771 x264_mb_analyse_inter_b16x8( h, &analysis );
2772 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2773 i_type, analysis.i_mb_type16x8,
2774 i_partition, D_16x8 );
2776 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2777 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2779 x264_mb_analyse_inter_b8x16( h, &analysis );
2780 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2781 i_type, analysis.i_mb_type8x16,
2782 i_partition, D_8x16 );
2787 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2792 else if( i_partition == D_16x16 )
2794 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2795 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2796 if( i_type == B_L0_L0 )
2798 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2799 i_cost = analysis.l0.me16x16.cost
2800 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2802 else if( i_type == B_L1_L1 )
2804 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2805 i_cost = analysis.l1.me16x16.cost
2806 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2808 else if( i_type == B_BI_BI )
2810 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2811 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2814 else if( i_partition == D_16x8 )
2816 for( i=0; i<2; i++ )
2818 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2819 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2820 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2821 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2824 else if( i_partition == D_8x16 )
2826 for( i=0; i<2; i++ )
2828 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2829 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2830 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2831 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2834 else if( i_partition == D_8x8 )
2836 for( i=0; i<4; i++ )
2839 int i_part_cost_old;
2841 int i_part_type = h->mb.i_sub_partition[i];
2842 int b_bidir = (i_part_type == D_BI_8x8);
2844 if( i_part_type == D_DIRECT_8x8 )
2846 if( x264_mb_partition_listX_table[0][i_part_type] )
2848 m = &analysis.l0.me8x8[i];
2849 i_part_cost_old = m->cost;
2850 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2851 m->cost -= i_type_cost;
2852 x264_me_refine_qpel( h, m );
2854 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2856 if( x264_mb_partition_listX_table[1][i_part_type] )
2858 m = &analysis.l1.me8x8[i];
2859 i_part_cost_old = m->cost;
2860 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2861 m->cost -= i_type_cost;
2862 x264_me_refine_qpel( h, m );
2864 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2866 /* TODO: update mvp? */
2870 i_satd_inter = i_cost;
2872 if( analysis.i_mbrd )
2874 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2876 i_cost = i_bskip_cost;
2877 i_partition = D_16x16;
2878 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2879 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2880 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2881 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2882 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2883 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2884 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2886 h->mb.i_type = i_type;
2887 h->mb.i_partition = i_partition;
2890 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2892 if( analysis.i_mbrd )
2894 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2895 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2898 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2899 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2900 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2901 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2903 h->mb.i_type = i_type;
2904 h->mb.i_partition = i_partition;
2906 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2907 x264_intra_rd_refine( h, &analysis );
2908 if( h->mb.i_subpel_refine >= 5 )
2909 x264_refine_bidir( h, &analysis );
2911 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2913 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2914 x264_analyse_update_cache( h, &analysis );
2916 if( i_partition == D_16x16 )
2918 if( i_type == B_L0_L0 )
2920 analysis.l0.me16x16.cost = i_cost;
2921 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2923 else if( i_type == B_L1_L1 )
2925 analysis.l1.me16x16.cost = i_cost;
2926 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2928 else if( i_type == B_BI_BI )
2929 x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2931 else if( i_partition == D_16x8 )
2933 for( i = 0; i < 2; i++ )
2935 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2936 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2937 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2938 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2939 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2940 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2941 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2944 else if( i_partition == D_8x16 )
2946 for( i = 0; i < 2; i++ )
2948 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2949 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2950 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2951 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2952 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2953 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2954 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2957 else if( i_partition == D_8x8 )
2959 for( i = 0; i < 4; i++ )
2961 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2962 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2963 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2964 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2965 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2966 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
2973 x264_analyse_update_cache( h, &analysis );
2975 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
2976 * without realizing it. Check for this and account for it if necessary. */
2977 if( analysis.i_mbrd >= 2 )
2979 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
2980 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
2981 int list = check_mv_lists[h->mb.i_type] - 1;
2982 if( list >= 0 && h->mb.i_partition != D_16x16 &&
2983 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
2984 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
2985 h->mb.i_partition = D_16x16;
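/* Blocks 0 and 12 sit in different halves of the MB for both the 16x8 and 8x16 splits,
 * so a matching MV and ref there means the two halves converged to the same motion and
 * the MB can be relabeled as a single 16x16 partition, saving the extra MV. */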
2988 if( !analysis.i_mbrd )
2989 x264_mb_analyse_transform( h );
2991 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2992 x264_mb_analyse_qp_rd( h, &analysis );
2994 h->mb.b_trellis = h->param.analyse.i_trellis;
2995 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2996 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2997 x264_psy_trellis_init( h, 0 );
2998 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2999 h->mb.i_skip_intra = 0;
3002 /*-------------------- Update MB from the analysis ----------------------*/
3003 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3007 switch( h->mb.i_type )
3010 for( i = 0; i < 16; i++ )
3011 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3013 x264_mb_analyse_intra_chroma( h, a );
3016 for( i = 0; i < 4; i++ )
3017 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3019 x264_mb_analyse_intra_chroma( h, a );
3022 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3023 x264_mb_analyse_intra_chroma( h, a );
3030 switch( h->mb.i_partition )
3033 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3034 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3038 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3039 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3040 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3041 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3045 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3046 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3047 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3048 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3052 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3058 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3059 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3060 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3061 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3062 for( i = 0; i < 4; i++ )
3063 x264_mb_cache_mv_p8x8( h, a, i );
3068 h->mb.i_partition = D_16x16;
3069 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3070 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3076 x264_mb_load_mv_direct8x8( h, 0 );
3077 x264_mb_load_mv_direct8x8( h, 1 );
3078 x264_mb_load_mv_direct8x8( h, 2 );
3079 x264_mb_load_mv_direct8x8( h, 3 );
3083 /* optimize: cache might not need to be rewritten */
3084 for( i = 0; i < 4; i++ )
3085 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3088 default: /* the rest of the B types */
3089 switch( h->mb.i_partition )
3092 switch( h->mb.i_type )
3095 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3096 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3098 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3099 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3100 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3103 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3104 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3105 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3107 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3108 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3111 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3112 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3114 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3115 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3120 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3121 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3124 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3125 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3128 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3134 if( h->param.i_threads > 1 && !h->param.b_sliced_threads && !IS_INTRA(h->mb.i_type) )
3137 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3140 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3143 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
3144 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3146 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3147 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3148 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3149 h->mb.cache.mv[l][x264_scan8[15]][0],
3150 h->mb.cache.mv[l][x264_scan8[15]][1] );
3151 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3152 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3153 fprintf(stderr, "completed: %d \n", completed );
3154 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3155 x264_mb_analyse_intra( h, a, COST_MAX );
3156 h->mb.i_type = I_16x16;
3157 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3158 x264_mb_analyse_intra_chroma( h, a );
3165 #include "slicetype.c"