1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
/* NOTE(review): this excerpt begins mid-struct; the opening "typedef struct"
 * lines of x264_mb_analysis_list_t are outside the visible region. */
x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
ALIGNED_4( int16_t mvc[32][5][2] );
int i_cost4x4[4]; /* cost per 8x8 partition */
x264_me_t me4x4[4][4];
int i_cost8x4[4]; /* cost per 8x8 partition */
x264_me_t me8x4[4][2];
int i_cost4x8[4]; /* cost per 8x8 partition */
x264_me_t me4x8[4][2];
} x264_mb_analysis_list_t;
/* conduct the analysis using this lambda and QP */
uint16_t *p_cost_ref[2]; /* per-list ref-index bit costs (rows of x264_cost_ref) */
/* Take some shortcuts in intra search if intra is deemed unlikely */
int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
/* I: Intra part — per-mode SATD costs and chosen modes */
int i_satd_i16x16_dir[7];      /* cost per i16x16 prediction mode */
int i_satd_i8x8_dir[12][4];    /* cost per i8x8 mode, per 8x8 sub-block */
int i_predict4x4[16];          /* chosen i4x4 mode per 4x4 sub-block */
int i_satd_i8x8chroma;
int i_satd_i8x8chroma_dir[7];  /* cost per chroma prediction mode */
int i_predict8x8chroma;
/* II: Inter part P/B frame */
x264_mb_analysis_list_t l0;
x264_mb_analysis_list_t l1;
int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
int i_cost16x16direct;
int i_cost8x8direct[4];
int i_mb_partition16x8[2]; /* mb_partition_e */
int i_mb_partition8x16[2];
int i_mb_type16x8; /* mb_class_e */
int b_direct_available;
} x264_mb_analysis_t;
/* lambda = pow(2,qp/6-2) */
const uint8_t x264_lambda_tab[52] = {
    1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
    1, 1, 1, 1, /* 8-11 */
    1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
    3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
    6, 7, 8, 9,10,11,13,14, /* 28-35 */
    16,18,20,23,25,29,32,36, /* 36-43 */
    40,45,51,57,64,72,81,91 /* 44-51 */
/* lambda2 = pow(lambda,2) * .9 * 256 */
const int x264_lambda2_tab[52] = {
    14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
    91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
    580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
    3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
    23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
    148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
    943718, 1189010, 1498059, 1887436 /* 48 - 51 */
/* Fractional lookup used for fixed-point exp2; presumably round(2^(i/64)*256)-256
 * — TODO confirm against the exp2fix8 helper (not in this excerpt). */
const uint8_t x264_exp2_lut[64] = {
    0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
    48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
/* log2(1 + i/128) for i in [0,127] — the fractional part of a log2 lookup */
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
/* Avoid an int/float conversion. */
/* maps a count-leading-zeros result to the integer part of log2 */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    { 46, 58, 73, 92, 117, 147,
    185, 233, 294, 370, 466, 587,
    740, 932, 1174, 1480, 1864, 2349,
    2959, 3728, 4697, 5918, 7457, 9395,
    11837, 14914, 18790, 23674, 29828, 37581,
    47349, 59656, 75163, 94699, 119313, 150326,
    189399, 238627, 300652, 378798, 477255, 601304,
    757596, 954511, 1202608, 1515192, 1909022, 2405217,
    3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    { 27, 34, 43, 54, 68, 86,
    108, 136, 172, 216, 273, 343,
    433, 545, 687, 865, 1090, 1374,
    1731, 2180, 2747, 3461, 4361, 5494,
    6922, 8721, 10988, 13844, 17442, 21976,
    27688, 34885, 43953, 55377, 69771, 87906,
    110755, 139543, 175813, 221511, 279087, 351627,
    443023, 558174, 703255, 886046, 1116348, 1406511,
    1772093, 2232697, 2813022, 3544186 }
/* chroma lambda2 scale factors, indexed by (luma QP - chroma QP + 12);
 * 256 == unity (see x264_mb_analyse_init_qp) */
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
    16, 20, 25, 32, 40, 50,
    64, 80, 101, 128, 161, 203,
    256, 322, 406, 512, 645, 812,
    1024, 1290, 1625, 2048, 2580, 3250,
    4096, 5160, 6501, 8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
/* TODO: calculate CABAC costs */
/* CAVLC bit-cost tables for coding mb/sub-mb types */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
static const uint8_t i_sub_mb_p_cost_table[4] = {
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
/* ref-index coding cost, indexed [lambda][clipped active-ref count][ref index];
 * filled lazily in x264_analyse_init_costs under cost_ref_mutex */
static uint16_t x264_cost_ref[92][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* Allocate and fill the per-lambda MV and ref-index bit-cost tables for `qp`.
 * Tables are keyed by lambda (not QP), so repeated QPs mapping to the same
 * lambda share one allocation; an early bail-out handles the already-init case.
 * NOTE(review): loop-variable declarations, braces and the return/fail paths
 * are elided from this excerpt — CHECKED_MALLOC presumably jumps to a fail
 * label outside the visible region; verify in the full file. */
int x264_analyse_init_costs( x264_t *h, int qp )
    int lambda = x264_lambda_tab[qp];
    /* already initialized for this lambda */
    if( h->cost_mv[lambda] )
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
    /* bias the pointer to the table's center so it accepts signed MV deltas */
    h->cost_mv[lambda] += 2*4*2048;
    for( i = 0; i <= 2*4*2048; i++ )
    /* cost model: lambda * (bits to code |mv|), symmetric in sign */
    h->cost_mv[lambda][-i] =
    h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
    /* x264_cost_ref is shared across encoder instances — guard with a mutex */
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( i = 0; i < 3; i++ )
    for( j = 0; j < 33; j++ )
    x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
    x264_pthread_mutex_unlock( &cost_ref_mutex );
    /* full-pel cost tables, one per qpel phase, only needed for exhaustive ME */
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
    CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv_fpel[lambda][j] += 2*2048;
    for( i = -2*2048; i < 2*2048; i++ )
    h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
/* Free every per-lambda cost table allocated by x264_analyse_init_costs,
 * undoing the center-bias applied to each pointer before freeing.
 * NOTE(review): braces and the null-check on cost_mv[i] are elided here. */
void x264_analyse_free_costs( x264_t *h )
    for( i = 0; i < 92; i++ )
    x264_free( h->cost_mv[i] - 2*4*2048 );
    if( h->cost_mv_fpel[i][0] )
    for( j = 0; j < 4; j++ )
    x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
/* Incrementally apply weighted-prediction scaling to the weighted reference
 * planes, up to row `16 + end` (plus vertical padding). Progress is tracked in
 * h->fenc->i_lines_weighted so repeated calls only scale new rows. Scaled
 * output for each weighted L0 ref k goes to h->fenc->weighted[k]. */
void x264_analyse_weight_frame( x264_t *h, int end )
    for( j=0; j<h->i_ref0; j++ )
    if( h->sh.weight[j][0].weightfn )
    x264_frame_t *frame = h->fref0[j];
    int width = frame->i_width[0] + 2*PADH;
    int i_padv = PADV << h->param.b_interlaced;
    /* src spans the padded plane, starting above/left of the picture proper */
    uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
    /* number of not-yet-weighted rows to process this call */
    height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
    offset = h->fenc->i_lines_weighted*frame->i_stride[0];
    h->fenc->i_lines_weighted += height;
    /* scale the same row span for every remaining weighted ref sharing this frame */
    for( k = j; k < h->i_ref0; k++ )
    if( h->sh.weight[k][0].weightfn )
    uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
    x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
    src + offset, frame->i_stride[0],
    width, height, &h->sh.weight[k][0] );
/* initialize an array of lambda*nbits for all possible mvs */
/* Point the analysis struct at the precomputed MV and ref-index cost tables
 * for the current lambda; ref tables are selected by the clipped number of
 * active refs in each list. */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
    a->p_cost_mv = h->cost_mv[a->i_lambda];
    a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* Set up all QP-derived analysis state: luma/chroma QP, lambda, lambda2,
 * trellis lambdas, and the psy chroma-lambda offset. */
static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
    /* conduct the analysis using this lambda and QP */
    a->i_qp = h->mb.i_qp = i_qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
    a->i_lambda = x264_lambda_tab[i_qp];
    a->i_lambda2 = x264_lambda2_tab[i_qp];
    /* trellis=1 only runs trellis during RD refinement; trellis=2 always */
    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    /* [luma/chroma][inter/intra] trellis lambdas */
    h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
    h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
    h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
    h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
/* Per-macroblock analysis initialization: derive the RD mode (i_mbrd), reset
 * per-mode costs to COST_MAX, compute the legal MV search range (including
 * thread-sync and intra-refresh restrictions), and make the fast-intra and
 * forced-intra decisions.
 * NOTE(review): many braces/declarations are elided from this excerpt. */
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
    int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
    x264_mb_analyse_init_qp( h, a, i_qp );
    h->mb.i_me_method = h->param.analyse.i_me_method;
    h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
    /* B-frames drop one subme level at these particular settings */
    if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
    h->mb.i_subpel_refine--;
    h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
    && h->mb.i_subpel_refine >= 5;
    h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
    (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;
    /* I: Intra part — invalidate cached costs */
    a->i_satd_i8x8chroma = COST_MAX;
    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
    /* NOTE(review): the assignment target of this expression is elided in the
     * excerpt (likely a fast-skip flag) — see the full file. */
    h->mb.b_lossless ? 0 :
    !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    int i_fmv_range = 4 * h->param.analyse.i_mv_range;
    // limit motion search to a slightly smaller range than the theoretical limit,
    // since the search may go a few iterations past its given range
    int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
    /* Calculate max allowed MV range */
    #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
    h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
    h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
    h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
    h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
    if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
    int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
    int max_mv = max_x - 4*16*h->mb.i_mb_x;
    /* If we're left of the refresh bar, don't reference right of it. */
    if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
    h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
    h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
    h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
    /* vertical range only needs recomputation once per row */
    if( h->mb.i_mb_x == 0 )
    int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
    int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
    int thread_mvy_range = i_fmv_range;
    if( h->i_thread_frames > 1 )
    int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
    int thresh = pix_y + h->param.analyse.i_mv_range_thread;
    /* wait until every reference frame has completed enough rows */
    for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
    x264_frame_t **fref = i ? h->fref1 : h->fref0;
    int i_ref = i ? h->i_ref1 : h->i_ref0;
    for( j=0; j<i_ref; j++ )
    x264_frame_cond_wait( fref[j]->orig, thresh );
    thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
    if( h->param.b_deterministic )
    thread_mvy_range = h->param.analyse.i_mv_range_thread;
    if( h->mb.b_interlaced )
    thread_mvy_range >>= 1;
    x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
    h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
    h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
    /* NOTE(review): this clips the vertical minimum to +i_fmv_range while
     * CLIP_FMV (used for every other component) clips to i_fmv_range-1 —
     * confirm the asymmetry is intentional. */
    h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
    h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
    h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
    h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
    h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
    /* invalidate cached inter costs */
    a->l0.i_cost8x8 = COST_MAX;
    for( i = 0; i < 4; i++ )
    a->l0.i_cost4x8[i] = COST_MAX;
    a->l0.i_cost8x16 = COST_MAX;
    if( h->sh.i_type == SLICE_TYPE_B )
    a->l1.i_cost8x8 = COST_MAX;
    for( i = 0; i < 4; i++ )
    a->i_cost8x8direct[i] = COST_MAX;
    a->i_cost16x16direct =
    a->i_cost8x16bi = COST_MAX;
    /* Fast intra decision */
    if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
    /* intra stays likely if any neighbor is intra, the co-located ref MB is
     * intra, or a large share of the frame so far is intra */
    if( IS_INTRA( h->mb.i_mb_type_left )
    || IS_INTRA( h->mb.i_mb_type_top )
    || IS_INTRA( h->mb.i_mb_type_topleft )
    || IS_INTRA( h->mb.i_mb_type_topright )
    || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
    || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
    { /* intra is likely */ }
    /* inside the periodic-intra-refresh column band, force intra coding */
    if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
    h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
    a->b_force_intra = 1;
    a->b_force_intra = 0;
/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
static const int8_t i8x8chroma_mode_available[5][5] =
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
static const int8_t i4x4_mode_available[5][10] =
    {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
/* Select the -1-terminated list of legal i16x16 modes for the given neighbor
 * availability. The masked neighbour bits are used directly as the row index,
 * which presumes MB_LEFT/MB_TOP occupy the two low flag bits — TODO confirm
 * against the flag definitions (not in this excerpt). */
static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
/* Same selection for 8x8 chroma modes. */
static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
/* Same selection for 4x4 (and 8x8 luma) modes. */
static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
/* Cache the DCT of the source macroblock (against a zero prediction) for use
 * as the psy-trellis reference. */
static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
    ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
    if( do_both_dct || h->mb.b_transform_8x8 )
    h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
    h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
/* Fills h->mb.pic.fenc_satd / fenc_sa8d per 4x4 / 8x8 block (SATD/SA8D
 * against zero, with the DC term subtracted via a scaled SAD), plus their
 * sums. Skipped entirely when psy-RD is off.
 * NOTE(review): the `fenc` declaration and early-return/braces are elided. */
static inline void x264_mb_cache_fenc_satd( x264_t *h )
    ALIGNED_16( static uint8_t zero[16] ) = {0};
    int x, y, satd_sum = 0, sa8d_sum = 0;
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
    x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
    for( y = 0; y < 4; y++ )
    for( x = 0; x < 4; x++ )
    fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
    /* SATD minus DC: SAD>>1 approximates the DC coefficient's contribution */
    h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
    - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
    satd_sum += h->mb.pic.fenc_satd[y][x];
    for( y = 0; y < 2; y++ )
    for( x = 0; x < 2; x++ )
    fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
    h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
    - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
    sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
    h->mb.pic.fenc_satd_sum = satd_sum;
    h->mb.pic.fenc_sa8d_sum = sa8d_sum;
/* Choose the intra chroma (8x8c) prediction mode: evaluates every available
 * mode's SATD cost (+ mode-signalling bits), recording per-mode costs in
 * a->i_satd_i8x8chroma_dir and the winner in a->i_predict8x8chroma.
 * Cached: returns early if a cost was already computed for this MB. */
static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
    if( a->i_satd_i8x8chroma < COST_MAX )
    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
    /* 8x8 prediction selection for chroma */
    /* fast path: one call scores V/H/DC for both planes; plane mode done separately */
    if( predict_mode[3] >= 0 && b_merged_satd )
    int satdu[4], satdv[4];
    h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
    h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
    h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
    h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
    satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
    satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
    for( ; *predict_mode >= 0; predict_mode++ )
    int i_mode = *predict_mode;
    int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
    a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
    COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
    /* slow path: predict and score each mode individually */
    for( ; *predict_mode >= 0; predict_mode++ )
    int i_mode = *predict_mode;
    /* we do the prediction */
    if( h->mb.b_lossless )
    x264_predict_lossless_8x8_chroma( h, i_mode );
    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
    /* we calculate the cost */
    i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
    h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
    a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
    a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
    COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* Full luma intra analysis: scores i16x16, i8x8 and i4x4 modes (per the
 * analyse flags), storing the best cost of each partition size in
 * a->i_satd_i16x16 / i_satd_i8x8 / i_satd_i4x4 and the chosen modes in the
 * analysis struct. i_satd_inter is used as an early-termination threshold.
 * NOTE(review): braces, several declarations (i_satd, i_cost, idx, x, y,
 * satd[], break statements) are elided from this excerpt. */
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    uint8_t *p_src = h->mb.pic.p_fenc[0];
    uint8_t *p_dst = h->mb.pic.p_fdec[0];
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
    /*---------------- Try all mode and calculate their score ---------------*/
    /* 16x16 prediction selection */
    const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
    if( b_merged_satd && predict_mode[3] >= 0 )
    /* fast path: V/H/DC in one call, plane mode separately */
    h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
    h->predict_16x16[I_PRED_16x16_P]( p_dst );
    a->i_satd_i16x16_dir[I_PRED_16x16_P] =
    h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
    int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
    COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
    /* slow path: predict and score each available mode */
    for( ; *predict_mode >= 0; predict_mode++ )
    int i_mode = *predict_mode;
    if( h->mb.b_lossless )
    x264_predict_lossless_16x16( h, i_mode );
    h->predict_16x16[i_mode]( p_dst );
    i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
    a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
    COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
    a->i_satd_i16x16_dir[i_mode] = i_satd;
    if( h->sh.i_type == SLICE_TYPE_B )
    /* cavlc mb type prefix */
    a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
    /* fast-intra bail-out when intra already looks hopeless vs. inter */
    if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    ALIGNED_ARRAY_16( uint8_t, edge,[33] );
    x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
    int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
    h->mb.i_cbp_luma = 0;
    b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
    // FIXME some bias like in i4x4?
    if( h->sh.i_type == SLICE_TYPE_B )
    i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
    for( idx = 0;; idx++ )
    uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
    uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
    int i_best = COST_MAX;
    int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
    predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
    h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
    if( b_merged_satd && predict_mode[8] >= 0 )
    h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
    /* favor the predicted mode (cheaper to signal) */
    satd[i_pred_mode] -= 3 * a->i_lambda;
    for( i=2; i>=0; i-- )
    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
    COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
    for( ; *predict_mode >= 0; predict_mode++ )
    int i_mode = *predict_mode;
    if( h->mb.b_lossless )
    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
    h->predict_8x8[i_mode]( p_dst_by, edge );
    i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
    i_satd -= a->i_lambda * 3;
    COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
    a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
    if( idx == 3 || i_cost > i_satd_thresh )
    /* we need to encode this block now (for next ones) */
    h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
    x264_mb_encode_i8x8( h, idx, a->i_qp );
    x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
    a->i_satd_i8x8 = i_cost;
    /* snapshot reconstructed pixels/NNZ so RD can restore them later */
    if( h->mb.i_skip_intra )
    h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
    h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
    h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
    h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
    h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
    h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
    if( h->mb.i_skip_intra == 2 )
    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
    /* aborted early: extrapolate the partial cost (x4/1, x2, x4/3) */
    static const uint16_t cost_div_fix8[3] = {1024,512,341};
    a->i_satd_i8x8 = COST_MAX;
    i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
    if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
    h->mb.i_cbp_luma = 0;
    b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
    i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
    i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
    if( h->sh.i_type == SLICE_TYPE_B )
    i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
    for( idx = 0;; idx++ )
    uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
    uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
    int i_best = COST_MAX;
    int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
    const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
    if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
    /* emulate missing topright samples */
    M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
    if( b_merged_satd && predict_mode[5] >= 0 )
    h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
    satd[i_pred_mode] -= 3 * a->i_lambda;
    for( i=2; i>=0; i-- )
    COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
    for( ; *predict_mode >= 0; predict_mode++ )
    int i_mode = *predict_mode;
    if( h->mb.b_lossless )
    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
    h->predict_4x4[i_mode]( p_dst_by );
    i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
    i_satd -= a->i_lambda * 3;
    COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
    i_cost += i_best + 4 * a->i_lambda;
    if( i_cost > i_satd_thresh || idx == 15 )
    /* we need to encode this block now (for next ones) */
    h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
    x264_mb_encode_i4x4( h, idx, a->i_qp );
    h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
    a->i_satd_i4x4 = i_cost;
    /* snapshot reconstructed pixels/NNZ so RD can restore them later */
    if( h->mb.i_skip_intra )
    h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
    h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
    h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
    h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
    h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
    h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
    if( h->mb.i_skip_intra == 2 )
    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
    a->i_satd_i4x4 = COST_MAX;
/* Re-evaluate the SATD-chosen intra candidates with full RD cost, replacing
 * each a->i_satd_* with its RD cost, or COST_MAX if it misses the threshold.
 * NOTE(review): braces and `else` lines are elided from this excerpt. */
static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
    if( a->i_satd_i16x16 <= i_satd_thresh )
    h->mb.i_type = I_16x16;
    x264_analyse_update_cache( h, a );
    a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    a->i_satd_i16x16 = COST_MAX;
    if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
    h->mb.i_type = I_4x4;
    x264_analyse_update_cache( h, a );
    a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    a->i_satd_i4x4 = COST_MAX;
    if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
    h->mb.i_type = I_8x8;
    x264_analyse_update_cache( h, a );
    a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    /* remember the 8x8 CBP — x264_intra_rd_refine needs it to re-encode */
    a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    a->i_satd_i8x8 = COST_MAX;
928 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
930 uint8_t *p_dst = h->mb.pic.p_fdec[0];
933 int i_mode, i_thresh;
934 uint64_t i_satd, i_best;
935 h->mb.i_skip_intra = 0;
937 if( h->mb.i_type == I_16x16 )
939 int old_pred_mode = a->i_predict16x16;
940 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
941 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
942 i_best = a->i_satd_i16x16;
943 for( ; *predict_mode >= 0; predict_mode++ )
945 int i_mode = *predict_mode;
946 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
948 h->mb.i_intra16x16_pred_mode = i_mode;
949 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
950 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
954 /* RD selection for chroma prediction */
955 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
956 if( predict_mode[1] >= 0 )
958 int8_t predict_mode_sorted[4];
960 i_thresh = a->i_satd_i8x8chroma * 5/4;
962 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
964 i_mode = *predict_mode;
965 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
966 predict_mode_sorted[i_max++] = i_mode;
971 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
972 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
973 /* the previous thing encoded was x264_intra_rd(), so the pixels and
974 * coefs for the current chroma mode are still around, so we only
975 * have to recount the bits. */
976 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
977 for( i = 0; i < i_max; i++ )
979 i_mode = predict_mode_sorted[i];
980 if( h->mb.b_lossless )
981 x264_predict_lossless_8x8_chroma( h, i_mode );
984 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
985 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
987 /* if we've already found a mode that needs no residual, then
988 * probably any mode with a residual will be worse.
989 * so avoid dct on the remaining modes to improve speed. */
990 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
991 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
993 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
994 h->mb.i_cbp_chroma = i_cbp_chroma_best;
998 if( h->mb.i_type == I_4x4 )
1000 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1002 for( idx = 0; idx < 16; idx++ )
1004 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1005 i_best = COST_MAX64;
1007 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
1009 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1010 /* emulate missing topright samples */
1011 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1013 for( ; *predict_mode >= 0; predict_mode++ )
1015 i_mode = *predict_mode;
1016 if( h->mb.b_lossless )
1017 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1019 h->predict_4x4[i_mode]( p_dst_by );
1020 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1022 if( i_best > i_satd )
1024 a->i_predict4x4[idx] = i_mode;
1026 pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1027 pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1028 pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1029 pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1030 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1034 M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1035 M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1036 M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1037 M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1038 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1040 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1043 else if( h->mb.i_type == I_8x8 )
1045 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1046 for( idx = 0; idx < 4; idx++ )
1048 uint64_t pels_h = 0;
1050 uint16_t i_nnz[2] = {0}; //shut up gcc
1053 int cbp_luma_new = 0;
1054 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1056 i_best = COST_MAX64;
1060 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1061 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1062 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1064 for( ; *predict_mode >= 0; predict_mode++ )
1066 i_mode = *predict_mode;
1067 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1070 if( h->mb.b_lossless )
1071 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1073 h->predict_8x8[i_mode]( p_dst_by, edge );
1074 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1075 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1077 if( i_best > i_satd )
1079 a->i_predict8x8[idx] = i_mode;
1080 cbp_luma_new = h->mb.i_cbp_luma;
1083 pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1085 for( j=0; j<7; j++ )
1086 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1087 i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1088 i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1091 a->i_cbp_i8x8_luma = cbp_luma_new;
1092 M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1094 for( j=0; j<7; j++ )
1095 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1096 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1097 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1099 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* LOAD_FENC: point an x264_me_t at the source (to-be-encoded) planes for the
 * sub-block at (xoff,yoff); chroma offsets are halved (4:2:0 layout), and the
 * MV bit-cost table from the analysis context is attached.  Relies on a local
 * `a` (x264_mb_analysis_t *) being in scope at the expansion site. */
1104 #define LOAD_FENC( m, src, xoff, yoff) \
1105 (m)->p_cost_mv = a->p_cost_mv; \
1106 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1107 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1108 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1109 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1110 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* LOAD_HPELS: point the x264_me_t at the reference planes for (xoff,yoff):
 * planes 0-3 are luma (full-pel + the three half-pel interpolations), 4-5 are
 * chroma at half offsets, plus the SAD integral image for this list/ref.
 * Defaults the weight to weight_none; LOAD_WPELS overrides it when weighted
 * prediction is in use. */
1112 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1113 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1114 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1115 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1116 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1117 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1118 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1119 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1120 (m)->weight = weight_none; \
/* LOAD_WPELS: substitute the weighted luma reference plane and the slice
 * weights.  NOTE(review): it reads the caller's local `i_ref`, not its own
 * `ref` parameter — expansion sites must have `i_ref` in scope. */
1123 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1124 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1125 (m)->weight = h->sh.weight[i_ref];
/* REF_COST: bit cost (scaled) of coding reference index `ref` in list `list`,
 * from the per-list table precomputed in the analysis context. */
1127 #define REF_COST(list, ref) \
1128 (a->p_cost_ref[list][ref])
/* P-macroblock 16x16 analysis: motion-search every list-0 reference frame,
 * keep the cheapest result in a->l0.me16x16, and probe for an early P_SKIP
 * exit when the best MV is (nearly) identical to the predicted skip MV. */
1130 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1134 ALIGNED_4( int16_t mvc[8][2] );
1135 int i_halfpel_thresh = INT_MAX;
1136 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1138 /* 16x16 Search on all ref frame */
1139 m.i_pixel = PIXEL_16x16;
1140 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1142 a->l0.me16x16.cost = INT_MAX;
1143 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1145 const int i_ref_cost = REF_COST( 0, i_ref );
/* Bias the half-pel threshold by the ref cost so comparisons between refs
 * with different coding costs stay fair; undone after the search below. */
1146 i_halfpel_thresh -= i_ref_cost;
1147 m.i_ref_cost = i_ref_cost;
1149 /* search with ref */
1150 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1151 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1153 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1154 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
/* A blind duplicate ref (e.g. weightp dup) can reuse ref 0's MV and only
 * needs qpel refinement instead of a full search. */
1156 if( h->mb.ref_blind_dupe == i_ref )
1158 CP32( m.mv, a->l0.mvc[0][0] );
1159 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1162 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1164 /* early termination
1165 * SSD threshold would probably be better than SATD */
1168 && m.cost-m.cost_mv < 300*a->i_lambda
1169 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1170 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1171 && x264_macroblock_probe_pskip( h ) )
1173 h->mb.i_type = P_SKIP;
1174 x264_analyse_update_cache( h, a );
1175 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1179 m.cost += i_ref_cost;
1180 i_halfpel_thresh += i_ref_cost;
1182 if( m.cost < a->l0.me16x16.cost )
1183 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1185 /* save mv for predicting neighbors */
1186 CP32( a->l0.mvc[i_ref][0], m.mv );
1187 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1190 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1191 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1193 h->mb.i_type = P_L0;
/* RD path: if the winner is ref 0 with exactly the skip MV, re-check whether
 * the block can be coded as P_SKIP (no residual) under RD cost. */
1196 x264_mb_cache_fenc_satd( h );
1197 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1199 h->mb.i_partition = D_16x16;
1200 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1201 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1202 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1203 h->mb.i_type = P_SKIP;
/* P-macroblock 8x8 analysis with per-partition reference selection
 * (mixed refs): each of the four 8x8 blocks searches every candidate
 * reference and keeps its own best ref/MV in a->l0.me8x8[]. */
1208 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1212 uint8_t **p_fenc = h->mb.pic.p_fenc;
1213 int i_halfpel_thresh = INT_MAX;
1214 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1216 int i_maxref = h->mb.pic.i_fref[0]-1;
1218 h->mb.i_partition = D_8x8;
/* Raise i_maxref to cover any ref a neighbouring block actually used
 * (blind dupes excluded — they're handled separately below). */
1220 #define CHECK_NEIGHBOUR(i)\
1222 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1223 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1227 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1228 * than those used by the neighbors */
1229 if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1230 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1233 CHECK_NEIGHBOUR( -8 - 1 );
1234 CHECK_NEIGHBOUR( -8 + 0 );
1235 CHECK_NEIGHBOUR( -8 + 2 );
1236 CHECK_NEIGHBOUR( -8 + 4 );
1237 CHECK_NEIGHBOUR( 0 - 1 );
1238 CHECK_NEIGHBOUR( 2*8 - 1 );
/* Seed each ref's MV candidate list with its 16x16 result. */
1241 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1242 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1244 for( i = 0; i < 4; i++ )
1246 x264_me_t *l0m = &a->l0.me8x8[i];
1250 m.i_pixel = PIXEL_8x8;
1252 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1253 l0m->cost = INT_MAX;
/* No increment here: the loop bottom steps i_ref, possibly jumping to the
 * blind-dupe ref after the last "real" one. */
1254 for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1256 const int i_ref_cost = REF_COST( 0, i_ref );
1257 m.i_ref_cost = i_ref_cost;
1259 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1260 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1262 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1263 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1264 if( h->mb.ref_blind_dupe == i_ref )
1266 CP32( m.mv, a->l0.mvc[0][i+1] );
1267 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1270 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1272 m.cost += i_ref_cost;
1273 i_halfpel_thresh += i_ref_cost;
1274 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1276 if( m.cost < l0m->cost )
1277 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1278 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1279 i_ref = h->mb.ref_blind_dupe;
/* Commit the winning MV/ref so later partitions see correct predictors. */
1283 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1284 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1286 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1287 are effectively zero. */
1288 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1289 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1292 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1293 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1294 /* P_8x8 ref0 has no ref cost */
1295 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1296 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1297 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1298 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1299 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-macroblock 8x8 analysis with a single shared reference: all four 8x8
 * partitions reuse the ref chosen by the 16x16 search (no mixed refs). */
1302 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1304 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1305 * reference frame flags. Thus, if we're not doing mixedrefs, just
1306 * don't bother analysing the dupes. */
1307 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
/* CAVLC ref 0 codes for free; otherwise charge the ref's bit cost. */
1308 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1309 uint8_t **p_fenc = h->mb.pic.p_fenc;
1311 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1314 /* XXX Needed for x264_mb_predict_mv */
1315 h->mb.i_partition = D_8x8;
/* Candidate list starts with the 16x16 MV; each searched partition appends
 * its own result for the next partition to use as a predictor. */
1318 CP32( mvc[0], a->l0.me16x16.mv );
1320 for( i = 0; i < 4; i++ )
1322 x264_me_t *m = &a->l0.me8x8[i];
1326 m->i_pixel = PIXEL_8x8;
1327 m->i_ref_cost = i_ref_cost;
1329 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1330 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1331 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1333 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1334 x264_me_search( h, m, mvc, i_mvc );
1336 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1338 CP32( mvc[i_mvc], m->mv );
1342 m->cost += i_ref_cost;
1343 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1344 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1347 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1348 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1349 /* theoretically this should include 4*ref_cost,
1350 * but 3 seems a better approximation of cabac. */
1351 if( h->param.b_cabac )
1352 a->l0.i_cost8x8 -= i_ref_cost;
1353 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1354 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-macroblock 16x8 analysis: for each of the two 16x8 halves, try at most
 * the two refs its underlying 8x8 partitions chose, reusing their MVs as
 * predictors.  Result accumulates in a->l0.me16x8[] / a->l0.i_cost16x8. */
1357 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1360 uint8_t **p_fenc = h->mb.pic.p_fenc;
1361 ALIGNED_4( int16_t mvc[3][2] );
1364 /* XXX Needed for x264_mb_predict_mv */
1365 h->mb.i_partition = D_16x8;
1367 for( i = 0; i < 2; i++ )
1369 x264_me_t *l0m = &a->l0.me16x8[i];
/* Candidate refs = the (up to two) distinct refs of the covered 8x8s. */
1370 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1371 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1372 const int ref8[2] = { minref, maxref };
1373 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1375 m.i_pixel = PIXEL_16x8;
1377 LOAD_FENC( &m, p_fenc, 0, 8*i );
1378 l0m->cost = INT_MAX;
1379 for( j = 0; j < i_ref8s; j++ )
1381 const int i_ref = ref8[j];
1382 const int i_ref_cost = REF_COST( 0, i_ref );
1383 m.i_ref_cost = i_ref_cost;
1385 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1386 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1387 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1388 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1390 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1391 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1393 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1394 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1395 /* We can only take this shortcut if the first search was performed on ref0. */
1396 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1398 /* We can just leave the MV from the previous ref search. */
1399 x264_me_refine_qpel_refdupe( h, &m, NULL );
1402 x264_me_search( h, &m, mvc, 3 );
1404 m.cost += i_ref_cost;
1406 if( m.cost < l0m->cost )
1407 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1409 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1410 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1413 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P-macroblock 8x16 analysis: mirror of the 16x8 case for the two vertical
 * halves; candidate refs/MVs come from the 8x8 partitions each half covers
 * (8x8 blocks i and i+2).  Fills a->l0.me8x16[] / a->l0.i_cost8x16. */
1416 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1419 uint8_t **p_fenc = h->mb.pic.p_fenc;
1420 ALIGNED_4( int16_t mvc[3][2] );
1423 /* XXX Needed for x264_mb_predict_mv */
1424 h->mb.i_partition = D_8x16;
1426 for( i = 0; i < 2; i++ )
1428 x264_me_t *l0m = &a->l0.me8x16[i];
1429 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1430 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1431 const int ref8[2] = { minref, maxref };
1432 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1434 m.i_pixel = PIXEL_8x16;
1436 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1437 l0m->cost = INT_MAX;
1438 for( j = 0; j < i_ref8s; j++ )
1440 const int i_ref = ref8[j];
1441 const int i_ref_cost = REF_COST( 0, i_ref );
1442 m.i_ref_cost = i_ref_cost;
1444 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1445 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1446 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1448 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1449 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1451 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1452 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1453 /* We can only take this shortcut if the first search was performed on ref0. */
1454 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1456 /* We can just leave the MV from the previous ref search. */
1457 x264_me_refine_qpel_refdupe( h, &m, NULL );
1460 x264_me_search( h, &m, mvc, 3 );
1462 m.cost += i_ref_cost;
1464 if( m.cost < l0m->cost )
1465 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1467 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1468 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1471 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1474 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1476 ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1477 uint8_t *pix2 = pix1+8;
1478 const int i_stride = h->mb.pic.i_stride[1];
1479 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1480 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1481 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1482 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1483 x264_weight_t *weight = h->sh.weight[i_ref];
1485 #define CHROMA4x4MC( width, height, me, x, y ) \
1486 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1487 if( weight[1].weightfn ) \
1488 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1489 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1490 if( weight[2].weightfn ) \
1491 weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1494 if( pixel == PIXEL_4x4 )
1496 x264_me_t *m = a->l0.me4x4[i8x8];
1497 CHROMA4x4MC( 2,2, m[0], 0,0 );
1498 CHROMA4x4MC( 2,2, m[1], 2,0 );
1499 CHROMA4x4MC( 2,2, m[2], 0,2 );
1500 CHROMA4x4MC( 2,2, m[3], 2,2 );
1502 else if( pixel == PIXEL_8x4 )
1504 x264_me_t *m = a->l0.me8x4[i8x8];
1505 CHROMA4x4MC( 4,2, m[0], 0,0 );
1506 CHROMA4x4MC( 4,2, m[1], 0,2 );
1510 x264_me_t *m = a->l0.me4x8[i8x8];
1511 CHROMA4x4MC( 2,4, m[0], 0,0 );
1512 CHROMA4x4MC( 2,4, m[1], 2,0 );
1515 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1516 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* Search the four 4x4 sub-partitions of 8x8 block i8x8 (ref fixed to the
 * 8x8 search's choice), accumulate their cost plus ref and sub-partition
 * signalling bits into a->l0.i_cost4x4[i8x8], optionally adding chroma. */
1519 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1521 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1522 uint8_t **p_fenc = h->mb.pic.p_fenc;
1523 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1526 /* XXX Needed for x264_mb_predict_mv */
1527 h->mb.i_partition = D_8x8;
1529 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1531 const int idx = 4*i8x8 + i4x4;
1532 const int x4 = block_idx_x[idx];
1533 const int y4 = block_idx_y[idx];
/* Only the first sub-block gets the 8x8 MV as an extra candidate. */
1534 const int i_mvc = (i4x4 == 0);
1536 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1538 m->i_pixel = PIXEL_4x4;
1540 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1541 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1542 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1544 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1545 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1547 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1549 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1550 a->l0.me4x4[i8x8][1].cost +
1551 a->l0.me4x4[i8x8][2].cost +
1552 a->l0.me4x4[i8x8][3].cost +
1553 REF_COST( 0, i_ref ) +
1554 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1555 if( h->mb.b_chroma_me )
1556 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* Search the two 8x4 sub-partitions of 8x8 block i8x8; cost (including ref
 * and sub-partition bits) goes to a->l0.i_cost8x4[i8x8].  MV predictor
 * candidate for the second half comes from the 4x4 search's first block. */
1559 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1561 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1562 uint8_t **p_fenc = h->mb.pic.p_fenc;
1563 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1566 /* XXX Needed for x264_mb_predict_mv */
1567 h->mb.i_partition = D_8x8;
1569 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1571 const int idx = 4*i8x8 + 2*i8x4;
1572 const int x4 = block_idx_x[idx];
1573 const int y4 = block_idx_y[idx];
1574 const int i_mvc = (i8x4 == 0);
1576 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1578 m->i_pixel = PIXEL_8x4;
1580 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1581 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1582 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1584 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1585 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1587 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1589 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1590 REF_COST( 0, i_ref ) +
1591 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1592 if( h->mb.b_chroma_me )
1593 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* Search the two 4x8 sub-partitions of 8x8 block i8x8; mirror of the 8x4
 * case.  Cost accumulates into a->l0.i_cost4x8[i8x8]. */
1596 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1598 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1599 uint8_t **p_fenc = h->mb.pic.p_fenc;
1600 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1603 /* XXX Needed for x264_mb_predict_mv */
1604 h->mb.i_partition = D_8x8;
1606 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1608 const int idx = 4*i8x8 + i4x8;
1609 const int x4 = block_idx_x[idx];
1610 const int y4 = block_idx_y[idx];
1611 const int i_mvc = (i4x8 == 0);
1613 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1615 m->i_pixel = PIXEL_4x8;
1617 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1618 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1619 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1621 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1622 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1624 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1626 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1627 REF_COST( 0, i_ref ) +
1628 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1629 if( h->mb.b_chroma_me )
1630 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* Cost the B_DIRECT candidate: compare the source against the direct-mode
 * prediction already sitting in fdec, per 8x8 block and for the whole MB.
 * Fills a->i_cost8x8direct[0..3] and a->i_cost16x16direct. */
1633 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1635 /* Assumes that fdec still contains the results of
1636 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1638 uint8_t **p_fenc = h->mb.pic.p_fenc;
1639 uint8_t **p_fdec = h->mb.pic.p_fdec;
1642 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1643 for( i = 0; i < 4; i++ )
1645 const int x = (i&1)*8;
1646 const int y = (i>>1)*8;
1647 a->i_cost16x16direct +=
1648 a->i_cost8x8direct[i] =
1649 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
/* Per-8x8 cost also pays the DIRECT_8x8 sub-partition signalling bits. */
1652 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1656 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1658 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1659 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1660 uint8_t *src0, *src1;
1661 int stride0 = 16, stride1 = 16;
1665 ALIGNED_4( int16_t mvc[9][2] );
1666 int i_halfpel_thresh = INT_MAX;
1667 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1669 /* 16x16 Search on all ref frame */
1670 m.i_pixel = PIXEL_16x16;
1671 m.weight = weight_none;
1673 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1676 a->l0.me16x16.cost = INT_MAX;
1677 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1679 const int i_ref_cost = REF_COST( 0, i_ref );
1680 m.i_ref_cost = i_ref_cost;
1681 /* search with ref */
1682 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1683 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1684 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1685 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1688 m.cost += i_ref_cost;
1690 if( m.cost < a->l0.me16x16.cost )
1692 a->l0.i_ref = i_ref;
1693 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1696 /* save mv for predicting neighbors */
1697 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1699 a->l0.me16x16.i_ref = a->l0.i_ref;
1702 i_halfpel_thresh = INT_MAX;
1703 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1704 a->l1.me16x16.cost = INT_MAX;
1705 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1707 const int i_ref_cost = REF_COST( 0, i_ref );
1708 m.i_ref_cost = i_ref_cost;
1709 /* search with ref */
1710 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1711 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1712 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1713 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1716 m.cost += i_ref_cost;
1718 if( m.cost < a->l1.me16x16.cost )
1720 a->l1.i_ref = i_ref;
1721 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1724 /* save mv for predicting neighbors */
1725 CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1727 a->l1.me16x16.i_ref = a->l1.i_ref;
1729 /* get cost of BI mode */
1730 int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
1731 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1732 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1733 src0 = h->mc.get_ref( pix0, &stride0,
1734 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1735 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1736 src1 = h->mc.get_ref( pix1, &stride1,
1737 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1738 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1740 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1742 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1744 + a->l0.bi16x16.cost_mv
1745 + a->l1.bi16x16.cost_mv;
1748 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1749 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1751 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1752 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1753 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1754 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1755 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
1756 h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
1757 h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1758 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1759 + ref_costs + l0_mv_cost + l1_mv_cost;
1760 if( cost00 < a->i_cost16x16bi )
1762 M32( a->l0.bi16x16.mv ) = 0;
1763 M32( a->l1.bi16x16.mv ) = 0;
1764 a->l0.bi16x16.cost_mv = l0_mv_cost;
1765 a->l1.bi16x16.cost_mv = l1_mv_cost;
1766 a->i_cost16x16bi = cost00;
1771 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1772 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1773 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the chosen MVs of P 8x8 partition i into the MB cache, dispatching
 * on the sub-partition type (8x8 / two 8x4 / two 4x8 / four 4x4). */
1776 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1778 const int x = 2*(i%2);
1779 const int y = 2*(i/2);
1781 switch( h->mb.i_sub_partition[i] )
1784 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1787 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1788 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1791 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1792 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1795 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1796 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1797 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1798 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1801 x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* Copy the precomputed direct-mode refs and MVs for 8x8 block idx from
 * h->mb.cache.direct_* into the active MB cache (both lists). */
1806 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1808 const int x = 2*(idx&1);
1809 const int y = 2*(idx>>1);
1810 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1811 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1812 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1813 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
/* CACHE_MV_BI: store a B partition's refs/MVs in the MB cache.  For each
 * list, if the partition type uses that list, write the ref and the searched
 * MV; otherwise mark the ref unused (-1) and zero the MV (and the MVD when
 * b_mvd is set at the expansion site). */
1816 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1817 if( x264_mb_partition_listX_table[0][part] ) \
1819 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1820 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1824 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1825 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1827 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1829 if( x264_mb_partition_listX_table[1][part] ) \
1831 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1832 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1836 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1837 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1839 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache the refs/MVs of B 8x8 partition i.  DIRECT_8x8 partitions load the
 * precomputed direct vectors (zeroing MVDs and setting skip when b_mvd);
 * everything else goes through CACHE_MV_BI. */
1842 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1846 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1848 x264_mb_load_mv_direct8x8( h, i );
1851 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1852 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1853 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1858 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache the refs/MVs of B 16x8 half i (i = 0 top, 1 bottom). */
1861 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1863 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache the refs/MVs of B 8x16 half i (i = 0 left, 1 right). */
1865 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1867 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B-macroblock 8x8 analysis: for each of the four 8x8 partitions, search L0
 * and L1 (using the per-list 16x16 ref/MV as seed), build the BI average,
 * and pick the cheapest of L0/L1/BI/DIRECT per partition.  Total goes to
 * a->i_cost8x8bi; chosen types go to h->mb.i_sub_partition[]. */
1871 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1873 uint8_t **p_fref[2] =
1874 { h->mb.pic.p_fref[0][a->l0.i_ref],
1875 h->mb.pic.p_fref[1][a->l1.i_ref] };
1876 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1879 /* XXX Needed for x264_mb_predict_mv */
1880 h->mb.i_partition = D_8x8;
1884 for( i = 0; i < 4; i++ )
1889 int i_part_cost_bi = 0;
1890 int stride[2] = {8,8};
1893 for( l = 0; l < 2; l++ )
1895 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1896 const int i_ref_cost = REF_COST( l, lX->i_ref );
1897 x264_me_t *m = &lX->me8x8[i];
1899 m->i_pixel = PIXEL_8x8;
1900 m->i_ref_cost = i_ref_cost;
1902 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1903 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1905 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
1906 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1907 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1908 m->cost += i_ref_cost;
1910 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* Fetch this list's motion-compensated block for the BI average below. */
1913 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1914 m->mv[0], m->mv[1], 8, 8, weight_none );
1915 i_part_cost_bi += m->cost_mv + i_ref_cost;
1917 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1918 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1919 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1920 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1921 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* Keep the cheapest of L0 / L1 / BI / DIRECT for this partition. */
1923 i_part_cost = a->l0.me8x8[i].cost;
1924 h->mb.i_sub_partition[i] = D_L0_8x8;
1925 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1926 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1927 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1928 a->i_cost8x8bi += i_part_cost;
1930 /* XXX Needed for x264_mb_predict_mv */
1931 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1935 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B-macroblock 16x8 analysis: for each horizontal half, search L0 and L1
 * (seeded with the covered 8x8 MVs), form the BI average, and pick the
 * cheapest of L0/L1/BI.  Also derives the overall MB type signalling cost. */
1938 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1940 uint8_t **p_fref[2] =
1941 { h->mb.pic.p_fref[0][a->l0.i_ref],
1942 h->mb.pic.p_fref[1][a->l1.i_ref] };
1943 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1944 ALIGNED_4( int16_t mvc[2][2] );
1947 h->mb.i_partition = D_16x8;
1948 a->i_cost16x8bi = 0;
1950 for( i = 0; i < 2; i++ )
1953 int i_part_cost_bi = 0;
1954 int stride[2] = {16,16};
1957 /* TODO: check only the list(s) that were used in b8x8? */
1958 for( l = 0; l < 2; l++ )
1960 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1961 const int i_ref_cost = REF_COST( l, lX->i_ref );
1962 x264_me_t *m = &lX->me16x8[i];
1964 m->i_pixel = PIXEL_16x8;
1965 m->i_ref_cost = i_ref_cost;
1967 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1968 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
/* MV candidates: this list's two 8x8 results covered by the half. */
1970 CP32( mvc[0], lX->me8x8[2*i].mv );
1971 CP32( mvc[1], lX->me8x8[2*i+1].mv );
1973 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
1974 x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1975 x264_me_search( h, m, mvc, 2 );
1976 m->cost += i_ref_cost;
1979 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1980 m->mv[0], m->mv[1], 16, 8, weight_none );
1981 i_part_cost_bi += m->cost_mv + i_ref_cost;
1983 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1984 i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1986 i_part_cost = a->l0.me16x8[i].cost;
1987 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1988 if( a->l1.me16x8[i].cost < i_part_cost )
1990 i_part_cost = a->l1.me16x8[i].cost;
1991 a->i_mb_partition16x8[i] = D_L1_8x8;
1993 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1995 i_part_cost = i_part_cost_bi;
1996 a->i_mb_partition16x8[i] = D_BI_8x8;
1998 a->i_cost16x8bi += i_part_cost;
2000 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* Map the two half-partition choices onto the combined 16x8 MB type. */
2004 a->i_mb_type16x8 = B_L0_L0
2005 + (a->i_mb_partition16x8[0]>>2) * 3
2006 + (a->i_mb_partition16x8[1]>>2);
2007 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
2010 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
/* B-frame 8x16 analysis: mirror image of x264_mb_analyse_inter_b16x8 for the
 * two vertical 8x16 halves.  For each half, search L0 and L1 (seeded with the
 * MVs of the two vertically-adjacent 8x8 blocks), build the bipred average,
 * and keep the cheapest of L0/L1/BI.  Totals go into a->i_cost8x16bi and the
 * macroblock type into a->i_mb_type8x16. */
2012 uint8_t **p_fref[2] =
2013 { h->mb.pic.p_fref[0][a->l0.i_ref],
2014 h->mb.pic.p_fref[1][a->l1.i_ref] };
2015 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2016 ALIGNED_4( int16_t mvc[2][2] );
2019 h->mb.i_partition = D_8x16;
2020 a->i_cost8x16bi = 0;
2022 for( i = 0; i < 2; i++ )
2025 int i_part_cost_bi = 0;
2026 int stride[2] = {8,8};
2029 for( l = 0; l < 2; l++ )
2031 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2032 const int i_ref_cost = REF_COST( l, lX->i_ref );
2033 x264_me_t *m = &lX->me8x16[i];
2035 m->i_pixel = PIXEL_8x16;
2036 m->i_ref_cost = i_ref_cost;
2038 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
2039 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
/* Seed with the MVs of the two 8x8 blocks in this vertical half (i, i+2). */
2041 CP32( mvc[0], lX->me8x8[i].mv );
2042 CP32( mvc[1], lX->me8x8[i+2].mv );
2044 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
2045 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2046 x264_me_search( h, m, mvc, 2 );
2047 m->cost += i_ref_cost;
/* Fetch this list's prediction for the bipred average. */
2050 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2051 m->mv[0], m->mv[1], 8, 16, weight_none );
2052 i_part_cost_bi += m->cost_mv + i_ref_cost;
/* Weighted average of the two predictions, then score the BI candidate. */
2055 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2056 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
/* Pick the cheapest of L0 / L1 / BI for this 8x16 half. */
2058 i_part_cost = a->l0.me8x16[i].cost;
2059 a->i_mb_partition8x16[i] = D_L0_8x8;
2060 if( a->l1.me8x16[i].cost < i_part_cost )
2062 i_part_cost = a->l1.me8x16[i].cost;
2063 a->i_mb_partition8x16[i] = D_L1_8x8;
2065 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2067 i_part_cost = i_part_cost_bi;
2068 a->i_mb_partition8x16[i] = D_BI_8x8;
2070 a->i_cost8x16bi += i_part_cost;
2072 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* Same type/cost table as 16x8 — the B_* type enumeration is symmetric. */
2076 a->i_mb_type8x16 = B_L0_L0
2077 + (a->i_mb_partition8x16[0]>>2) * 3
2078 + (a->i_mb_partition8x16[1]>>2);
2079 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
2082 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
/* Re-score the candidate P-frame partitions with true RD cost
 * (x264_rd_cost_mb) instead of SATD.  Only partitions whose SATD cost is
 * within the threshold of the best are worth the expensive RD evaluation;
 * the rest are ruled out by setting their cost to COST_MAX so the caller's
 * COPY*_IF_LT comparisons never select them. */
2084 int thresh = i_satd * 5/4;
2086 h->mb.i_type = P_L0;
/* 16x16 gets a looser 3/2 threshold and is only RD'd once (slot == COST_MAX). */
2087 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2089 h->mb.i_partition = D_16x16;
2090 x264_analyse_update_cache( h, a );
2091 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2094 if( a->l0.i_cost16x8 <= thresh )
2096 h->mb.i_partition = D_16x8;
2097 x264_analyse_update_cache( h, a );
2098 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2101 a->l0.i_cost16x8 = COST_MAX;
2103 if( a->l0.i_cost8x16 <= thresh )
2105 h->mb.i_partition = D_8x16;
2106 x264_analyse_update_cache( h, a );
2107 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2110 a->l0.i_cost8x16 = COST_MAX;
2112 if( a->l0.i_cost8x8 <= thresh )
2114 h->mb.i_type = P_8x8;
2115 h->mb.i_partition = D_8x8;
2116 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2119 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2120 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2121 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2122 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2123 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2124 * for future blocks are those left over from previous RDO calls. */
2125 for( i = 0; i < 4; i++ )
/* Per 8x8 block: RD-compare the sub-partition candidates (4x4/8x4/4x8/8x8)
 * that are within 25% SATD of the best; keep the RD winner in
 * i_sub_partition and restore its MVs into the cache if it changed. */
2127 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2128 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2129 int subtype, btype = D_L0_8x8;
2130 uint64_t bcost = COST_MAX64;
2131 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
/* Always evaluate D_L0_8x8 if nothing else survived the threshold, so bcost
 * never stays at COST_MAX64. */
2134 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2136 h->mb.i_sub_partition[i] = subtype;
2137 x264_mb_cache_mv_p8x8( h, a, i );
2138 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2139 COPY2_IF_LT( bcost, cost, btype, subtype );
2141 if( h->mb.i_sub_partition[i] != btype )
2143 h->mb.i_sub_partition[i] = btype;
2144 x264_mb_cache_mv_p8x8( h, a, i );
2149 x264_analyse_update_cache( h, a );
2150 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2153 a->l0.i_cost8x8 = COST_MAX;
2156 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
/* Re-score the candidate B-frame modes with true RD cost.  Each mode is
 * evaluated at most once (its i_rd* slot starts at COST_MAX and is only
 * filled here) and only if its SATD cost is close enough to the best
 * (thresh; slightly looser when psy-RD is on, since psy scores are noisier). */
2158 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2160 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2162 h->mb.i_type = B_DIRECT;
2163 /* Assumes direct/skip MC is still in fdec */
2164 /* Requires b-rdo to be done before intra analysis */
2165 h->mb.b_skip_mc = 1;
2166 x264_analyse_update_cache( h, a );
2167 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2168 h->mb.b_skip_mc = 0;
2171 //FIXME not all the update_cache calls are needed
2172 h->mb.i_partition = D_16x16;
/* 16x16 candidates: L0-only, L1-only, and bipred. */
2174 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2176 h->mb.i_type = B_L0_L0;
2177 x264_analyse_update_cache( h, a );
2178 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2182 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2184 h->mb.i_type = B_L1_L1;
2185 x264_analyse_update_cache( h, a );
2186 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2190 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2192 h->mb.i_type = B_BI_BI;
2193 x264_analyse_update_cache( h, a );
2194 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* Sub-16x16 partitions: 8x8, 16x8, 8x16. */
2198 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2200 h->mb.i_type = B_8x8;
2201 h->mb.i_partition = D_8x8;
2202 x264_analyse_update_cache( h, a );
2203 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2204 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2208 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2210 h->mb.i_type = a->i_mb_type16x8;
2211 h->mb.i_partition = D_16x8;
2212 x264_analyse_update_cache( h, a );
2213 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2217 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2219 h->mb.i_type = a->i_mb_type8x16;
2220 h->mb.i_partition = D_8x16;
2221 x264_analyse_update_cache( h, a );
2222 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2226 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
/* SATD-based bidir refinement of the MVs of whichever partitions chose BI
 * prediction, dispatched on the final partition size.  No-op for intra
 * macroblocks. */
2228 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2231 if( IS_INTRA(h->mb.i_type) )
2234 switch( h->mb.i_partition )
2237 if( h->mb.i_type == B_BI_BI )
2238 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
2241 for( i=0; i<2; i++ )
2242 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2243 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2246 for( i=0; i<2; i++ )
2247 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2248 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2251 for( i=0; i<4; i++ )
2252 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2253 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2258 static inline void x264_mb_analyse_transform( x264_t *h )
/* Choose between the 4x4 and 8x8 transforms by comparing the luma prediction
 * error under two metrics: SATD (proxy for 4x4 transform efficiency) vs SA8D
 * (proxy for 8x8).  Skipped when the 8x8 transform is disallowed, disabled,
 * or the MB is lossless.  Sets b_skip_mc since the MC result is left in fdec
 * for macroblock_encode to reuse. */
2260 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2262 int i_cost4, i_cost8;
2263 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2266 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2267 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2268 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2269 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2271 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2272 h->mb.b_skip_mc = 1;
2276 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
/* RD-check the opposite transform size: flip b_transform_8x8, measure the
 * full-MB RD cost, and keep the flip if it is no worse than the current
 * *i_rd; otherwise flip back.  On success the SATD score is rescaled by the
 * RD ratio so later SATD-vs-SATD comparisons stay consistent. */
2278 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2281 x264_analyse_update_cache( h, a );
2282 h->mb.b_transform_8x8 ^= 1;
2283 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2284 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2286 if( *i_rd >= i_rd8 )
/* Flipped transform won (or tied): scale SATD proportionally to the RD gain. */
2289 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
/* Flipped transform lost: restore the original choice. */
2293 h->mb.b_transform_8x8 ^= 1;
2297 /* Rate-distortion optimal QP selection.
2298 * FIXME: More than half of the benefit of this function seems to be
2299 * in the way it improves the coding of chroma DC (by decimating or
2300 * finding a better way to code a single DC coefficient.)
2301 * There must be a more efficient way to get that portion of the benefit
2302 * without doing full QP-RD, but RD-decimation doesn't seem to do the
2304 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
/* Walk the QP up and then down from the rate-control QP, re-running full-MB
 * RD at each step, and keep the QP with the best RD cost (bqp).  Each
 * direction stops after `failures` exceeds a small threshold, so the search
 * is a bounded hill-climb rather than exhaustive. */
2306 int bcost, cost, direction, failures, prevcost, origcost;
2307 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2308 int last_qp_tried = 0;
2309 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2310 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2312 /* If CBP is already zero, don't raise the quantizer any higher. */
2313 for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2315 /* Without psy-RD, require monotonicity when moving quant away from previous
2316 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2317 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2318 * allow 2 failures when moving quant towards previous quant.
2319 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2320 int threshold = (!!h->mb.i_psy_rd);
2321 /* Raise the threshold for failures if we're moving towards the last QP. */
2322 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2323 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2325 h->mb.i_qp = orig_qp;
2327 prevcost = origcost;
2329 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2330 * (up to a point) will too. So, jump down to where the threshold will kick in
2331 * and check the QP there. If the CBP is still empty, skip the main loop.
2332 * If it isn't empty, we would have ended up having to check this QP anyways,
2333 * so as long as we store it for later lookup, we lose nothing. */
2334 int already_checked_qp = -1;
2335 int already_checked_cost = COST_MAX;
2336 if( direction == -1 )
2340 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2341 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2342 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2343 if( !h->mb.cbp[h->mb.i_mb_xy] )
2345 /* If our empty-CBP block is lower QP than the last QP,
2346 * the last QP almost surely doesn't have a CBP either. */
2347 if( h->mb.i_last_qp > h->mb.i_qp )
2351 already_checked_qp = h->mb.i_qp;
2352 h->mb.i_qp = orig_qp;
2356 h->mb.i_qp += direction;
/* Main hill-climb: step QP in `direction`, reusing the pre-checked cost. */
2357 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2359 if( h->mb.i_last_qp == h->mb.i_qp )
2361 if( h->mb.i_qp == already_checked_qp )
2362 cost = already_checked_cost;
2365 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2366 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2367 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2370 /* We can't assume that the costs are monotonic over QPs.
2371 * Tie case-as-failure seems to give better results. */
2372 if( cost < prevcost )
2378 if( failures > threshold )
2380 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2382 h->mb.i_qp += direction;
2386 /* Always try the last block's QP. */
2387 if( !last_qp_tried )
2389 h->mb.i_qp = h->mb.i_last_qp;
2390 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2391 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2392 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2396 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2398 /* Check transform again; decision from before may no longer be optimal. */
2399 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2400 x264_mb_transform_8x8_allowed( h ) )
2402 h->mb.b_transform_8x8 ^= 1;
2403 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2405 h->mb.b_transform_8x8 ^= 1;
2409 /*****************************************************************************
2410 * x264_macroblock_analyse:
2411 *****************************************************************************/
2412 void x264_macroblock_analyse( x264_t *h )
/* Top-level per-macroblock mode decision.  Picks the macroblock QP, then
 * dispatches on slice type: I slices run only intra analysis; P and B slices
 * run skip detection, inter partition search, subpel refinement, intra
 * competition, and (at higher i_mbrd levels) RD refinement.  The winning mode
 * is left in h->mb.i_type / h->mb.i_partition and the MV/ref caches. */
2414 x264_mb_analysis_t analysis;
2415 int i_cost = COST_MAX;
/* --- QP selection --- */
2418 h->mb.i_qp = x264_ratecontrol_qp( h );
2419 if( h->param.rc.i_aq_mode )
2421 x264_adaptive_quant( h );
2422 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2423 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2424 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2425 h->mb.i_qp = h->mb.i_last_qp;
2428 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2430 /*--------------------------- Do the analysis ---------------------------*/
2431 if( h->sh.i_type == SLICE_TYPE_I )
/* I slice: intra-only competition between I_16x16 / I_8x8 / I_4x4 / I_PCM. */
2434 if( analysis.i_mbrd )
2435 x264_mb_cache_fenc_satd( h );
2436 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2437 if( analysis.i_mbrd )
2438 x264_intra_rd( h, &analysis, COST_MAX );
2440 i_cost = analysis.i_satd_i16x16;
2441 h->mb.i_type = I_16x16;
2442 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2443 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2444 if( analysis.i_satd_pcm < i_cost )
2445 h->mb.i_type = I_PCM;
2447 else if( analysis.i_mbrd >= 2 )
2448 x264_intra_rd_refine( h, &analysis );
2450 else if( h->sh.i_type == SLICE_TYPE_P )
/* P slice: skip probe, then L0 inter partition search + intra competition. */
2454 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2456 analysis.b_try_pskip = 0;
/* Periodic Intra Refresh: force intra coding of this MB (see b_force_intra). */
2457 if( analysis.b_force_intra )
2459 if( !h->param.analyse.b_psy )
2461 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2462 goto intra_analysis;
2467 /* Fast P_SKIP detection */
2468 if( h->param.analyse.b_fast_pskip )
2470 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2471 // FIXME don't need to check this if the reference frame is done
2473 else if( h->param.analyse.i_subpel_refine >= 3 )
2474 analysis.b_try_pskip = 1;
2475 else if( h->mb.i_mb_type_left == P_SKIP ||
2476 h->mb.i_mb_type_top == P_SKIP ||
2477 h->mb.i_mb_type_topleft == P_SKIP ||
2478 h->mb.i_mb_type_topright == P_SKIP )
2479 b_skip = x264_macroblock_probe_pskip( h );
2483 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
/* Skip confirmed: commit P_SKIP and bypass all further analysis. */
2487 h->mb.i_type = P_SKIP;
2488 h->mb.i_partition = D_16x16;
2489 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
2493 const unsigned int flags = h->param.analyse.inter;
2497 int i_satd_inter, i_satd_intra;
2499 x264_mb_analyse_load_costs( h, &analysis );
2501 x264_mb_analyse_inter_p16x16( h, &analysis );
/* p16x16 analysis may itself detect a skip (early termination). */
2503 if( h->mb.i_type == P_SKIP )
2506 if( flags & X264_ANALYSE_PSUB16x16 )
2508 if( h->param.analyse.b_mixed_references )
2509 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2511 x264_mb_analyse_inter_p8x8( h, &analysis );
2514 /* Select best inter mode */
2516 i_partition = D_16x16;
2517 i_cost = analysis.l0.me16x16.cost;
2519 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2520 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2523 i_partition = D_8x8;
2524 i_cost = analysis.l0.i_cost8x8;
/* 8x8 won: optionally descend into sub-8x8 partitions per block. */
2527 if( flags & X264_ANALYSE_PSUB8x8 )
2529 for( i = 0; i < 4; i++ )
2531 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2532 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2534 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2535 h->mb.i_sub_partition[i] = D_L0_4x4;
2537 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2538 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2539 h->mb.i_sub_partition[i], D_L0_8x4 );
2541 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2542 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2543 h->mb.i_sub_partition[i], D_L0_4x8 );
2545 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2547 x264_mb_cache_mv_p8x8( h, &analysis, i );
2549 analysis.l0.i_cost8x8 = i_cost;
2553 /* Now do 16x8/8x16 */
2554 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2555 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2556 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2558 x264_mb_analyse_inter_p16x8( h, &analysis );
2559 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2561 x264_mb_analyse_inter_p8x16( h, &analysis );
2562 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2565 h->mb.i_partition = i_partition;
/* --- Subpel (qpel) refinement of the winning P partition --- */
2568 //FIXME mb_type costs?
2569 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2573 else if( i_partition == D_16x16 )
2575 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2576 i_cost = analysis.l0.me16x16.cost;
2578 else if( i_partition == D_16x8 )
2580 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2581 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2582 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2584 else if( i_partition == D_8x16 )
2586 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2587 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2588 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2590 else if( i_partition == D_8x8 )
2594 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2596 switch( h->mb.i_sub_partition[i8x8] )
2599 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2600 i_cost += analysis.l0.me8x8[i8x8].cost;
2603 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2604 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2605 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2606 analysis.l0.me8x4[i8x8][1].cost;
2609 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2610 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2611 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2612 analysis.l0.me4x8[i8x8][1].cost;
2616 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2617 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2618 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2619 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2620 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2621 analysis.l0.me4x4[i8x8][1].cost +
2622 analysis.l0.me4x4[i8x8][2].cost +
2623 analysis.l0.me4x4[i8x8][3].cost;
2626 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
/* --- Intra competition against the best inter cost --- */
2632 if( h->mb.b_chroma_me )
2634 x264_mb_analyse_intra_chroma( h, &analysis );
2635 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2636 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2637 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2638 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2641 x264_mb_analyse_intra( h, &analysis, i_cost );
2643 i_satd_inter = i_cost;
2644 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2645 analysis.i_satd_i8x8,
2646 analysis.i_satd_i4x4 );
2648 if( analysis.i_mbrd )
2650 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2652 i_partition = D_16x16;
2653 i_cost = analysis.l0.i_rd16x16;
2654 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2655 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2656 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2657 h->mb.i_type = i_type;
2658 h->mb.i_partition = i_partition;
2659 if( i_cost < COST_MAX )
2660 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2661 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2664 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2665 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2666 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2667 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2669 h->mb.i_type = i_type;
2671 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2673 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2674 * it was an inter block. */
2675 x264_analyse_update_cache( h, &analysis );
2676 x264_macroblock_encode( h );
2677 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2678 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2679 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2680 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2681 goto intra_analysis;
/* --- RD-level qpel refinement of the final P decision (i_mbrd >= 2) --- */
2684 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2686 if( IS_INTRA( h->mb.i_type ) )
2688 x264_intra_rd_refine( h, &analysis );
2690 else if( i_partition == D_16x16 )
2692 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2693 analysis.l0.me16x16.cost = i_cost;
2694 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2696 else if( i_partition == D_16x8 )
2698 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2699 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2700 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2701 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2702 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2703 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2705 else if( i_partition == D_8x16 )
2707 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2708 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2709 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2710 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2711 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2712 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2714 else if( i_partition == D_8x8 )
2717 x264_analyse_update_cache( h, &analysis );
2718 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2720 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2722 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2724 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2726 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2727 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2729 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2731 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2732 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2734 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2736 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2737 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2738 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2739 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2746 else if( h->sh.i_type == SLICE_TYPE_B )
/* B slice: direct/skip detection, L0/L1/BI partition search, then intra. */
2748 int i_bskip_cost = COST_MAX;
2751 if( analysis.i_mbrd )
2752 x264_mb_cache_fenc_satd( h );
2754 h->mb.i_type = B_SKIP;
2755 if( h->mb.b_direct_auto_write )
2757 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2758 for( i = 0; i < 2; i++ )
2761 h->sh.b_direct_spatial_mv_pred ^= 1;
2762 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2763 if( analysis.b_direct_available )
2768 b_skip = x264_macroblock_probe_bskip( h );
2770 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2777 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2779 if( analysis.b_direct_available )
2781 if( !h->mb.b_direct_auto_write )
2783 if( analysis.i_mbrd )
2785 i_bskip_cost = ssd_mb( h );
2786 /* 6 = minimum cavlc cost of a non-skipped MB */
2787 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2789 else if( !h->mb.b_direct_auto_write )
2791 /* Conditioning the probe on neighboring block types
2792 * doesn't seem to help speed or quality. */
2793 b_skip = x264_macroblock_probe_bskip( h );
2799 const unsigned int flags = h->param.analyse.inter;
2803 h->mb.b_skip_mc = 0;
2805 x264_mb_analyse_load_costs( h, &analysis );
2807 /* select best inter mode */
2808 /* direct must be first */
2809 if( analysis.b_direct_available )
2810 x264_mb_analyse_inter_direct( h, &analysis );
2812 x264_mb_analyse_inter_b16x16( h, &analysis );
2815 i_partition = D_16x16;
2816 i_cost = analysis.l0.me16x16.cost;
2817 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2818 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2819 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
/* Early B_SKIP exit: if direct is competitive, RD-check whether skip beats
 * every RD-evaluated 16x16 mode. */
2821 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2823 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2824 if( i_bskip_cost < analysis.i_rd16x16direct &&
2825 i_bskip_cost < analysis.i_rd16x16bi &&
2826 i_bskip_cost < analysis.l0.i_rd16x16 &&
2827 i_bskip_cost < analysis.l1.i_rd16x16 )
2829 h->mb.i_type = B_SKIP;
2830 x264_analyse_update_cache( h, &analysis );
2835 if( flags & X264_ANALYSE_BSUB16x16 )
2837 x264_mb_analyse_inter_b8x8( h, &analysis );
2838 if( analysis.i_cost8x8bi < i_cost )
2841 i_partition = D_8x8;
2842 i_cost = analysis.i_cost8x8bi;
/* Try 16x8/8x16 only when adjacent 8x8 sub-partitions agree (cheap hint
 * that a larger partition could win). */
2844 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2845 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2847 x264_mb_analyse_inter_b16x8( h, &analysis );
2848 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2849 i_type, analysis.i_mb_type16x8,
2850 i_partition, D_16x8 );
2852 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2853 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2855 x264_mb_analyse_inter_b8x16( h, &analysis );
2856 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2857 i_type, analysis.i_mb_type8x16,
2858 i_partition, D_8x16 );
/* --- Subpel refinement of the winning B partition --- */
2863 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2868 else if( i_partition == D_16x16 )
2870 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2871 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2872 if( i_type == B_L0_L0 )
2874 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2875 i_cost = analysis.l0.me16x16.cost
2876 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2878 else if( i_type == B_L1_L1 )
2880 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2881 i_cost = analysis.l1.me16x16.cost
2882 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2884 else if( i_type == B_BI_BI )
2886 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
2887 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
2890 else if( i_partition == D_16x8 )
2892 for( i=0; i<2; i++ )
2894 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2895 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2896 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2897 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2900 else if( i_partition == D_8x16 )
2902 for( i=0; i<2; i++ )
2904 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2905 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2906 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2907 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2910 else if( i_partition == D_8x8 )
2912 for( i=0; i<4; i++ )
2915 int i_part_cost_old;
2917 int i_part_type = h->mb.i_sub_partition[i];
2918 int b_bidir = (i_part_type == D_BI_8x8);
2920 if( i_part_type == D_DIRECT_8x8 )
2922 if( x264_mb_partition_listX_table[0][i_part_type] )
2924 m = &analysis.l0.me8x8[i];
2925 i_part_cost_old = m->cost;
2926 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2927 m->cost -= i_type_cost;
2928 x264_me_refine_qpel( h, m );
2930 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2932 if( x264_mb_partition_listX_table[1][i_part_type] )
2934 m = &analysis.l1.me8x8[i];
2935 i_part_cost_old = m->cost;
2936 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2937 m->cost -= i_type_cost;
2938 x264_me_refine_qpel( h, m );
2940 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2942 /* TODO: update mvp? */
2946 i_satd_inter = i_cost;
2948 if( analysis.i_mbrd )
2950 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2952 i_cost = i_bskip_cost;
2953 i_partition = D_16x16;
2954 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2955 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2956 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2957 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2958 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2959 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2960 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2962 h->mb.i_type = i_type;
2963 h->mb.i_partition = i_partition;
/* --- Intra competition for B slices --- */
2966 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2968 if( analysis.i_mbrd )
2970 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2971 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2974 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2975 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2976 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2977 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2979 h->mb.i_type = i_type;
2980 h->mb.i_partition = i_partition;
2982 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2983 x264_intra_rd_refine( h, &analysis );
2984 if( h->mb.i_subpel_refine >= 5 )
2985 x264_refine_bidir( h, &analysis );
/* --- RD-level qpel/bidir refinement of the final B decision (i_mbrd >= 2) --- */
2987 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2989 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2990 x264_analyse_update_cache( h, &analysis );
2992 if( i_partition == D_16x16 )
2994 if( i_type == B_L0_L0 )
2996 analysis.l0.me16x16.cost = i_cost;
2997 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2999 else if( i_type == B_L1_L1 )
3001 analysis.l1.me16x16.cost = i_cost;
3002 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
3004 else if( i_type == B_BI_BI )
3005 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
3007 else if( i_partition == D_16x8 )
3009 for( i = 0; i < 2; i++ )
3011 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
3012 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
3013 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
3014 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
3015 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
3016 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3017 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3020 else if( i_partition == D_8x16 )
3022 for( i = 0; i < 2; i++ )
3024 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3025 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3026 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3027 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3028 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3029 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3030 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3033 else if( i_partition == D_8x8 )
3035 for( i = 0; i < 4; i++ )
3037 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3038 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3039 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3040 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3041 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3042 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
/* --- Final bookkeeping common to all slice types --- */
3049 x264_analyse_update_cache( h, &analysis );
3051 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3052 * without realizing it. Check for this and account for it if necessary. */
3053 if( analysis.i_mbrd >= 2 )
3055 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3056 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3057 int list = check_mv_lists[h->mb.i_type] - 1;
3058 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3059 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3060 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3061 h->mb.i_partition = D_16x16;
3064 if( !analysis.i_mbrd )
3065 x264_mb_analyse_transform( h );
3067 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3068 x264_mb_analyse_qp_rd( h, &analysis );
3070 h->mb.b_trellis = h->param.analyse.i_trellis;
3071 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3072 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3073 x264_psy_trellis_init( h, 0 );
3074 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3075 h->mb.i_skip_intra = 0;
3078 /*-------------------- Update MB from the analysis ----------------------*/
/* Commit the winning analysis decision into the macroblock cache: intra
 * prediction modes for intra types, or per-partition references and motion
 * vectors for inter types, so the encode stage reads a consistent state.
 * For threaded encodes it also sanity-checks that no MV points past the
 * lines completed by the reference frame's thread, recovering via I_16x16.
 * NOTE(review): case labels and brace lines were lost in extraction; the
 * branch comments below identify them from the visible cache writes. */
3079 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3083 switch( h->mb.i_type )
/* I_4x4 branch: store the 16 per-block luma prediction modes. */
3086 for( i = 0; i < 16; i++ )
3087 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3089 x264_mb_analyse_intra_chroma( h, a );
/* I_8x8 branch: store the 4 per-block modes at 8x8 granularity. */
3092 for( i = 0; i < 4; i++ )
3093 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3095 x264_mb_analyse_intra_chroma( h, a );
/* I_16x16 branch: a single whole-MB prediction mode. */
3098 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3099 x264_mb_analyse_intra_chroma( h, a );
/* P_L0 branch: write list-0 refs and MVs per the chosen partition. */
3106 switch( h->mb.i_partition )
3109 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3110 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3114 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3115 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3116 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3117 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3121 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3122 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3123 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3124 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
/* default: any other partition for P_L0 is a programming error. */
3128 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* P_8x8 branch: one ref per 8x8 block; sub-partition MVs via helper. */
3134 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3135 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3136 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3137 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3138 for( i = 0; i < 4; i++ )
3139 x264_mb_cache_mv_p8x8( h, a, i );
/* P_SKIP branch: forced 16x16 with ref 0 and the predicted skip MV. */
3144 h->mb.i_partition = D_16x16;
3145 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3146 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
/* B_SKIP / B_DIRECT branch: load direct-mode MVs for all four 8x8s. */
3152 h->mb.i_partition = h->mb.cache.direct_partition;
3153 x264_mb_load_mv_direct8x8( h, 0 );
3154 x264_mb_load_mv_direct8x8( h, 1 );
3155 x264_mb_load_mv_direct8x8( h, 2 );
3156 x264_mb_load_mv_direct8x8( h, 3 );
/* B_8x8 branch: rewrite all four blocks' cached motion. */
3160 /* optimize: cache might not need to be rewritten */
3161 for( i = 0; i < 4; i++ )
3162 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3165 default: /* the rest of the B types */
3166 switch( h->mb.i_partition )
/* D_16x16: dispatch on B_L0_L0 / B_L1_L1 / B_BI_BI. The unused list is
 * cleared with ref -1 and zero mv/mvd so neighbors predict correctly. */
3169 switch( h->mb.i_type )
/* B_L0_L0: list 0 active, list 1 cleared. */
3172 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3173 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3175 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3176 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3177 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
/* B_L1_L1: list 0 cleared, list 1 active. */
3180 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3181 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3182 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3184 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3185 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
/* B_BI_BI: both lists active, using the dedicated bi16x16 MVs. */
3188 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3189 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3191 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3192 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
/* D_16x8 / D_8x16: helper writes refs+MVs for each half. */
3197 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3198 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3201 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3202 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3205 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
/* Frame-threaded sanity check: an inter MV must not reference rows the
 * reference frame's encoding thread has not completed yet. */
3211 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
/* Iterate list 0 only for P slices, both lists for B slices. */
3214 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3217 int ref = h->mb.cache.ref[l][x264_scan8[0]];
/* ref >> b_interlaced: field refs share one underlying frame when MBAFF. */
3220 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
/* MV is in quarter-pel units; >> (2 - b_interlaced) converts to lines. */
3221 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3223 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3224 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3225 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3226 h->mb.cache.mv[l][x264_scan8[15]][0],
3227 h->mb.cache.mv[l][x264_scan8[15]][1] );
3228 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3229 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3230 fprintf(stderr, "completed: %d \n", completed );
/* Recover: force an intra MB so no out-of-range reference is read. */
3231 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3232 x264_mb_analyse_intra( h, a, COST_MAX );
3233 h->mb.i_type = I_16x16;
3234 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3235 x264_mb_analyse_intra_chroma( h, a );
3242 #include "slicetype.c"